Unicode Error when Rendering Table in Streamlit App

I am getting the following errors when attempting to present a dataframe in Streamlit

UnicodeEncodeError: 'utf-8' codec can't encode character '\udccf' in position 2: surrogates not allowed
Traceback:
File "c:\streamlit\script_runner.py", line 350, in _run_script
    exec(code, module.__dict__)
File "C:Analytics\logAnalytics.py", line 121, in <module>
    st.dataframe(dfMain)
File "c:\anaconda3\lib\site-packages\streamlit\elements\dataframe_selector.py", line 85, in dataframe
    return self.dg._arrow_dataframe(data, width, height)
File "conda3\lib\site-packages\streamlit\elements\arrow.py", line 82, in _arrow_dataframe
    marshall(proto, data, default_uuid)
File "cconda3\lib\site-packages\streamlit\elements\arrow.py", line 160, in marshall
    proto.data = type_util.data_frame_to_bytes(df)
File "nda3\lib\site-packages\streamlit\type_util.py", line 371, in data_frame_to_bytes
    table = pa.Table.from_pandas(df)
File "pyarrow\table.pxi", line 1561, in pyarrow.lib.Table.from_pandas
File "nda3\lib\site-packages\pyarrow\pandas_compat.py", line 607, in dataframe_to_arrays
    arrays[i] = maybe_fut.result()
File "aconda3\lib\concurrent\futures\_base.py", line 432, in result
    return self.__get_result()
File "conda3\lib\concurrent\futures\_base.py", line 388, in __get_result
    raise self._exception
File "\anaconda3\lib\concurrent\futures\thread.py", line 57, in run
    result = self.fn(*self.args, **self.kwargs)
File \anaconda3\lib\site-packages\pyarrow\pandas_compat.py", line 575, in convert_column
    result = pa.array(col, type=type_, from_pandas=True, safe=safe)
File "pyarrow\array.pxi", line 302, in pyarrow.lib.array
File "pyarrow\array.pxi", line 83, in pyarrow.lib._ndarray_to_array

I understand that there is a unicode character somewhere, but simply cannot find where there is one. I have removed all of the records with unicode characters. I can write to_csv properly and not run into a unicode issue. Is there something Streamlit cannot handle that I am missing? here is most of the code:

# Next up, we get the data from main logs and create a dataframe
mainLogs = []        
keyword = 'main'
for fname in os.listdir(cwd):
    if keyword in fname:
        mainLogs.append(fname)  
        
mainLogs = [item for item in mainLogs if not item.endswith('.gz')]
    
date = []
time = []
processID = []
threadID = []
priority = []
app = []
tagsText = []
readLine = []

for main in mainLogs:
    with open(main,encoding='utf8',errors='surrogateescape',newline='\n') as logs:
        try:
            for line in logs:
                lines = line.split()
                #for debugging
                readLine.append(lines)
                date.append(lines[0])
                time.append(lines[1])
                processID.append(lines[2])
                threadID.append(lines[3])
                priority.append(lines[4])
                app.append(lines[5])
                tagsText.append(lines[6:])
        except IndexError:
             pass

#Combine everything into a final junk field that is hard to parse easily. May revisit later.
tagsTextComb = []
for innerlist in tagsText:
    tagsTextComb.append(' '.join(innerlist)+" ")

#Do some basic checking on the length of the lists
print("length of Date"+' '+str(len(date)))
print("length of Time"+' '+str(len(time)))
print("length of processID"+' '+str(len(processID)))
print("length of threadID"+' '+str(len(threadID)))
print("length of priority"+' '+str(len(priority)))
print("length of app"+' '+str(len(app)))
print("length of tagsText"+' '+str(len(tagsText)))
print("length of tagsTextComb"+' '+str(len(tagsTextComb)))

#Create a dictionary that combines the parsed lists together for processing in a df
mainDict = {'date': date, 'time': time,'processID':processID,'threadID':threadID,'priority':priority,'app':app,'tagsText':tagsTextComb}

#Call the function that will even up the lists with x to make sure they are all the same len
pad_dict_list(mainDict,'x')

#Create the dfMain
dfMain = pd.DataFrame(mainDict)

#Add the source name to the df
dfMain['source'] = 'main'

def clean_text(row):
    return row.replace('[^\x00-\x7F]', "")

dfMain['tagsText'] = dfMain['tagsText'].apply(clean_text)
dfMain['date'] = dfMain['date'].apply(clean_text)
dfMain['time'] = dfMain['time'].apply(clean_text)
dfMain['processID'] = dfMain['processID'].apply(clean_text)
dfMain['threadID'] = dfMain['threadID'].apply(clean_text)
dfMain['priority'] = dfMain['priority'].apply(clean_text)
dfMain['app '] = dfMain['app'].apply(clean_text)
dfMain.drop(dfMain[dfMain['tagsText'].str.contains("")].index, inplace=True)

st.dataframe(dfMain)
st.table(dfMain.iloc[0:10])

Changed the error handling when opening the file to β€˜ignore’. Not entirely sure how much data loss that will cause but maybe unicode will not be written to the log files anymore.