I am getting the following errors when attempting to present a dataframe in Streamlit
UnicodeEncodeError: 'utf-8' codec can't encode character '\udccf' in position 2: surrogates not allowed
Traceback:
File "c:\streamlit\script_runner.py", line 350, in _run_script
exec(code, module.__dict__)
File "C:Analytics\logAnalytics.py", line 121, in <module>
st.dataframe(dfMain)
File "c:\anaconda3\lib\site-packages\streamlit\elements\dataframe_selector.py", line 85, in dataframe
return self.dg._arrow_dataframe(data, width, height)
File "conda3\lib\site-packages\streamlit\elements\arrow.py", line 82, in _arrow_dataframe
marshall(proto, data, default_uuid)
File "cconda3\lib\site-packages\streamlit\elements\arrow.py", line 160, in marshall
proto.data = type_util.data_frame_to_bytes(df)
File "nda3\lib\site-packages\streamlit\type_util.py", line 371, in data_frame_to_bytes
table = pa.Table.from_pandas(df)
File "pyarrow\table.pxi", line 1561, in pyarrow.lib.Table.from_pandas
File "nda3\lib\site-packages\pyarrow\pandas_compat.py", line 607, in dataframe_to_arrays
arrays[i] = maybe_fut.result()
File "aconda3\lib\concurrent\futures\_base.py", line 432, in result
return self.__get_result()
File "conda3\lib\concurrent\futures\_base.py", line 388, in __get_result
raise self._exception
File "\anaconda3\lib\concurrent\futures\thread.py", line 57, in run
result = self.fn(*self.args, **self.kwargs)
File \anaconda3\lib\site-packages\pyarrow\pandas_compat.py", line 575, in convert_column
result = pa.array(col, type=type_, from_pandas=True, safe=safe)
File "pyarrow\array.pxi", line 302, in pyarrow.lib.array
File "pyarrow\array.pxi", line 83, in pyarrow.lib._ndarray_to_array
I understand that there is a unicode character somewhere, but simply cannot find where there is one. I have removed all of the records with unicode characters. I can write to_csv properly and not run into a unicode issue. Is there something Streamlit cannot handle that I am missing? here is most of the code:
# Next up, we get the data from main logs and create a dataframe
mainLogs = []
keyword = 'main'
for fname in os.listdir(cwd):
if keyword in fname:
mainLogs.append(fname)
mainLogs = [item for item in mainLogs if not item.endswith('.gz')]
date = []
time = []
processID = []
threadID = []
priority = []
app = []
tagsText = []
readLine = []
for main in mainLogs:
with open(main,encoding='utf8',errors='surrogateescape',newline='\n') as logs:
try:
for line in logs:
lines = line.split()
#for debugging
readLine.append(lines)
date.append(lines[0])
time.append(lines[1])
processID.append(lines[2])
threadID.append(lines[3])
priority.append(lines[4])
app.append(lines[5])
tagsText.append(lines[6:])
except IndexError:
pass
#Combine everything into a final junk field that is hard to parse easily. May revisit later.
tagsTextComb = []
for innerlist in tagsText:
tagsTextComb.append(' '.join(innerlist)+" ")
#Do some basic checking on the length of the lists
print("length of Date"+' '+str(len(date)))
print("length of Time"+' '+str(len(time)))
print("length of processID"+' '+str(len(processID)))
print("length of threadID"+' '+str(len(threadID)))
print("length of priority"+' '+str(len(priority)))
print("length of app"+' '+str(len(app)))
print("length of tagsText"+' '+str(len(tagsText)))
print("length of tagsTextComb"+' '+str(len(tagsTextComb)))
#Create a dictionary that combines the parsed lists together for processing in a df
mainDict = {'date': date, 'time': time,'processID':processID,'threadID':threadID,'priority':priority,'app':app,'tagsText':tagsTextComb}
#Call the function that will even up the lists with x to make sure they are all the same len
pad_dict_list(mainDict,'x')
#Create the dfMain
dfMain = pd.DataFrame(mainDict)
#Add the source name to the df
dfMain['source'] = 'main'
def clean_text(row):
return row.replace('[^\x00-\x7F]', "")
dfMain['tagsText'] = dfMain['tagsText'].apply(clean_text)
dfMain['date'] = dfMain['date'].apply(clean_text)
dfMain['time'] = dfMain['time'].apply(clean_text)
dfMain['processID'] = dfMain['processID'].apply(clean_text)
dfMain['threadID'] = dfMain['threadID'].apply(clean_text)
dfMain['priority'] = dfMain['priority'].apply(clean_text)
dfMain['app '] = dfMain['app'].apply(clean_text)
dfMain.drop(dfMain[dfMain['tagsText'].str.contains("")].index, inplace=True)
st.dataframe(dfMain)
st.table(dfMain.iloc[0:10])