Unicode Error when Rendering Table in Streamlit App

I am getting the following errors when attempting to present a dataframe in Streamlit

UnicodeEncodeError: 'utf-8' codec can't encode character '\udccf' in position 2: surrogates not allowed
Traceback:
File "c:\streamlit\script_runner.py", line 350, in _run_script
    exec(code, module.__dict__)
File "C:Analytics\logAnalytics.py", line 121, in <module>
    st.dataframe(dfMain)
File "c:\anaconda3\lib\site-packages\streamlit\elements\dataframe_selector.py", line 85, in dataframe
    return self.dg._arrow_dataframe(data, width, height)
File "conda3\lib\site-packages\streamlit\elements\arrow.py", line 82, in _arrow_dataframe
    marshall(proto, data, default_uuid)
File "cconda3\lib\site-packages\streamlit\elements\arrow.py", line 160, in marshall
    proto.data = type_util.data_frame_to_bytes(df)
File "nda3\lib\site-packages\streamlit\type_util.py", line 371, in data_frame_to_bytes
    table = pa.Table.from_pandas(df)
File "pyarrow\table.pxi", line 1561, in pyarrow.lib.Table.from_pandas
File "nda3\lib\site-packages\pyarrow\pandas_compat.py", line 607, in dataframe_to_arrays
    arrays[i] = maybe_fut.result()
File "aconda3\lib\concurrent\futures\_base.py", line 432, in result
    return self.__get_result()
File "conda3\lib\concurrent\futures\_base.py", line 388, in __get_result
    raise self._exception
File "\anaconda3\lib\concurrent\futures\thread.py", line 57, in run
    result = self.fn(*self.args, **self.kwargs)
File \anaconda3\lib\site-packages\pyarrow\pandas_compat.py", line 575, in convert_column
    result = pa.array(col, type=type_, from_pandas=True, safe=safe)
File "pyarrow\array.pxi", line 302, in pyarrow.lib.array
File "pyarrow\array.pxi", line 83, in pyarrow.lib._ndarray_to_array

I understand that there is a unicode character somewhere, but simply cannot find where there is one. I have removed all of the records with unicode characters. I can write to_csv properly and not run into a unicode issue. Is there something Streamlit cannot handle that I am missing? here is most of the code:

# Next up, we get the data from main logs and create a dataframe
mainLogs = []        
keyword = 'main'
for fname in os.listdir(cwd):
    if keyword in fname:
        mainLogs.append(fname)  
        
mainLogs = [item for item in mainLogs if not item.endswith('.gz')]
    
date = []
time = []
processID = []
threadID = []
priority = []
app = []
tagsText = []
readLine = []

for main in mainLogs:
    with open(main,encoding='utf8',errors='surrogateescape',newline='\n') as logs:
        try:
            for line in logs:
                lines = line.split()
                #for debugging
                readLine.append(lines)
                date.append(lines[0])
                time.append(lines[1])
                processID.append(lines[2])
                threadID.append(lines[3])
                priority.append(lines[4])
                app.append(lines[5])
                tagsText.append(lines[6:])
        except IndexError:
             pass

#Combine everything into a final junk field that is hard to parse easily. May revisit later.
tagsTextComb = []
for innerlist in tagsText:
    tagsTextComb.append(' '.join(innerlist)+" ")

#Do some basic checking on the length of the lists
print("length of Date"+' '+str(len(date)))
print("length of Time"+' '+str(len(time)))
print("length of processID"+' '+str(len(processID)))
print("length of threadID"+' '+str(len(threadID)))
print("length of priority"+' '+str(len(priority)))
print("length of app"+' '+str(len(app)))
print("length of tagsText"+' '+str(len(tagsText)))
print("length of tagsTextComb"+' '+str(len(tagsTextComb)))

#Create a dictionary that combines the parsed lists together for processing in a df
mainDict = {'date': date, 'time': time,'processID':processID,'threadID':threadID,'priority':priority,'app':app,'tagsText':tagsTextComb}

#Call the function that will even up the lists with x to make sure they are all the same len
pad_dict_list(mainDict,'x')

#Create the dfMain
dfMain = pd.DataFrame(mainDict)

#Add the source name to the df
dfMain['source'] = 'main'

def clean_text(row):
    return row.replace('[^\x00-\x7F]', "")

dfMain['tagsText'] = dfMain['tagsText'].apply(clean_text)
dfMain['date'] = dfMain['date'].apply(clean_text)
dfMain['time'] = dfMain['time'].apply(clean_text)
dfMain['processID'] = dfMain['processID'].apply(clean_text)
dfMain['threadID'] = dfMain['threadID'].apply(clean_text)
dfMain['priority'] = dfMain['priority'].apply(clean_text)
dfMain['app '] = dfMain['app'].apply(clean_text)
dfMain.drop(dfMain[dfMain['tagsText'].str.contains("")].index, inplace=True)

st.dataframe(dfMain)
st.table(dfMain.iloc[0:10])

Changed the error handling when opening the file to ‘ignore’. Not entirely sure how much data loss that will cause but maybe unicode will not be written to the log files anymore.