File_uploader woes - pandas read_csv gives a consistent error with my parser

After a very happy couple of days using Streamlit, I have a couple of running apps and have managed to package an offline app - pretty cool, and not bad for a biologist! My first app is here: Streamlit used as a test case for data viewing.

The goal is an offline app for people to run on their own machines. It doesn't like being given a path to files the way local hosting for nativefier does… not sure why… so I am trying to use file_uploader instead. However, this isn't working as expected - most likely something I have missed. The docs suggest that the UploadedFile type is an in-memory version of the CSV file and should pass straight to pandas. Passing a single file, or a loop of files, directly to pandas read_csv does seem to work, but I get the usual error when reading these files through my parser class.
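
To be clear about what does work on its own, something like the following reads the uploads fine (names here are just for illustration):

import pandas as pd
import streamlit as st

uploaded_files = st.file_uploader("Upload CSV", type='csv', accept_multiple_files=True)
dfs = [pd.read_csv(f) for f in uploaded_files]  # each UploadedFile behaves like a file object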

So calling a globbed path to local files, or a URL to raw CSV files, works:

arrays = ArayaManager(files)  # 'files' normally contains a globbed list of local file names and works
                              # when passed via st.text_input and pathlib.

comp = arrays.concatenate_dataframes()  # concatenate the required files into one dataframe.

passing…

uploaded_files = st.file_uploader("Upload CSV", type='csv', accept_multiple_files=True)

does not work - I get an error that concatenate_dataframes takes one argument but got 2.
Why is this?
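
Looking at the class below, I think the intended pattern is for the file list to go to the constructor, with concatenate_dataframes() taking no arguments, i.e. roughly:

arrays = ArayaManager(uploaded_files)    # files go to the constructor
comp = arrays.concatenate_dataframes()   # no arguments here

so presumably the "got 2" means I am handing the files to the method rather than to the constructor somewhere.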
So I try this instead:

uploaded_files = st.file_uploader('Select files',type=['csv'],accept_multiple_files=True)
file_lst = [uploaded_file.getbuffer for uploaded_file in uploaded_files]
files = st.write(file_lst)

and try passing this to ArayaManager - I get the error 'NoneType' object is not iterable.

So files ends up as None (st.write() displays the list but doesn't return it), meaning I am not passing anything and there is nothing for concatenate_dataframes to work on…
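
What I was actually trying to express is something like this (whether ArayaManager can consume file-like objects at all is exactly what I am unsure about):

uploaded_files = st.file_uploader('Select files', type=['csv'], accept_multiple_files=True)
if uploaded_files:  # do nothing until something has been uploaded
    arrays = ArayaManager(uploaded_files)    # pass the UploadedFile objects themselves
    comp = arrays.concatenate_dataframes()   # not the return value of st.write(), which is None
    st.write(comp)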

The parser class

import os
import pandas as pd


class WellDataManager:
    """Parent class for importing data from any instrument"""
    def __init__(self,
                 files,
                 run_name_split_loc=1,
                 group_name=""):
        super().__init__()
        # project attributes
        self.file_names = files
        self.group_name = group_name
        self.run_name = ""
        self.split_char_loc = run_name_split_loc
        self.run_df = pd.DataFrame()
        self.group_df = pd.DataFrame()

        # csv read attributes
        self.tables = 1
        self.index_column = [0]
        self.header_row = [1]
        self.row_count = [8]
        self.column_names = ['Row_ID', 'Col_ID', 'Value']

    def concatenate_dataframes(self):
        for each_file in self.file_names:
            self.get_run_name(each_file)
            self.build_dataframes(each_file)
            self.group_df = pd.concat([self.group_df, self.run_df], ignore_index=True)
        # print(self.group_df)
        return self.group_df

    def build_dataframes(self, each_file):
        self.read_file(each_file)
        self.coerce_numeric_values()
        self.run_df['Group_ID'] = self.group_name
        self.run_df['File_root'] = each_file
        self.run_df['Run_ID'] = self.run_name
        # print(self.run_df)

    def coerce_numeric_values(self):
        # may be used to force values to numeric.  Not applicable to all instruments
        pass

    def read_file(self, file_name):
        """Reads Initial Data from CSV file"""
        # nrows needs a plain integer, so take the single entry from the row_count list
        df = pd.read_csv(file_name, header=self.header_row, nrows=self.row_count[0], index_col=self.index_column)
        df = df.stack()
        self.run_df = df.reset_index()
        self.run_df.columns = self.column_names

    def get_run_name(self, file_name):
        """Splits string to get run name from file name."""
        self.run_name = file_name[:self.split_char_loc]
        self.run_file_in = os.path.basename(file_name)
        

class ArtelVMSManager(WellDataManager):
    """Class that handles Well Data Data"""
    def __init__(self,
                 files,
                 run_name_split_loc=1,
                 group_name=""):
        super().__init__(files,
                         run_name_split_loc,
                         group_name)

        # csv read attributes
        self.tables = 1
        self.index_column = [0]
        self.header_row = [18]
        self.row_count = [8]
        self.column_names = ['Row_ID', 'Col_ID', 'Volume']

    def coerce_numeric_values(self):
        """Coerce the 'volume' data to numeric.  Otherwise mixture of strings and numeric values"""
        num_col = self.column_names[2]
        self.run_df[num_col] = pd.to_numeric(self.run_df[num_col], errors='coerce')


class ArayaManager(WellDataManager):
    """Class that handles Well Data Data"""
    def __init__(self,
                 files,
                 run_name_split_loc=6,
                 group_name="",
                 dyes=None,
                 separate_column=True):
        super().__init__(files,
                         run_name_split_loc,
                         group_name)

        if dyes is None:
            dyes = ['FAM', 'VIC', 'ROX']

        # Araya-specific items
        self.separate_column_per_dye = separate_column
        self.channel_df = pd.DataFrame()
        self.dyes = dyes
        self.channels = ['CH1', 'CH2', 'CH3']

        # csv read attributes
        self.tables = 3
        self.index_column = ["<>", "<>", "<>"]
        self.header_row = [5, 23, 41]
        self.row_count = [16, 16, 16]

        if self.separate_column_per_dye:
            # TODO: generalize for other dye names
            self.column_names = ['Row_ID', 'Col_ID', 'FAM_RFU', 'VIC_RFU', 'ROX_RFU']
        else:
            self.column_names = ['Row_ID', 'Col_ID', 'RFU', 'Channel', 'Dye']

    def read_each_channel(self, file_name, ch):
        """Reads Individual Channel Data from CSV file"""
        df = pd.read_csv(file_name,
                         header=self.header_row[ch],
                         nrows=self.row_count[ch],
                         na_values="<>")

        # Need to shift to get rid of annoying '<>'.  Otherwise won't parse correctly.
        df = df.shift(periods=1, axis='columns')
        #df.drop('<>', axis=1, inplace=True)

        # Stack df for various dyes and add additional columns
        df = df.stack()
        self.channel_df = df.reset_index()

        # For separate columns for each dye, rename RFU columns.  pd.concat() method does the rest!
        if self.separate_column_per_dye:
            self.channel_df.columns = self.column_names[0:3]
            self.channel_df.rename(columns={'FAM_RFU': f'{self.dyes[ch]}_RFU'},
                                   inplace=True)

        # case to stack all dyes into common RFU and Dye channels.
        else:
            self.channel_df['Channel'] = self.channels[ch]
            self.channel_df['Dye'] = self.dyes[ch]
            self.channel_df.columns = self.column_names


    def read_file(self, file_name):
        """Reads Each Channel Data from CSV file"""

        # loops through the 3 channel tables in the csv output files.
        self.run_df = pd.DataFrame()
        for ch in range(self.tables):
            self.read_each_channel(file_name, ch)

            # case to have separate columns for each dye
            if self.separate_column_per_dye:
                self.channel_df = self.channel_df[self.channel_df.columns.difference(self.run_df.columns)]
                self.run_df = pd.concat([self.run_df, self.channel_df], axis=1)

            # case to stack all dyes into common RFU and Dye channels.
            else:
                self.run_df = pd.concat([self.run_df, self.channel_df], ignore_index=True)

        # Force columns to correct order.  Fixed bug with concat of separate dye columns.
        self.run_df = self.run_df[self.column_names]

    def get_run_name(self, file_name):
        """Splits string to get run name from file name."""
        self.run_name = file_name[-(self.split_char_loc+4):-4]

I'm not sure why I cannot pass the UploadedFile data straight through to pandas here - if I pass a file directly to pd.read_csv it will read single files, and a for loop to read each does the same. Any help would be appreciated.
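
One further thought while writing this up: get_run_name() slices file_name as if it were a string path, so even if the UploadedFile objects do get through to pandas, I suspect that method would need to fall back on their .name attribute - just a guess on my part:

    def get_run_name(self, file_name):
        """Splits string to get run name from file name."""
        # UploadedFile objects aren't strings, so use their .name attribute instead (assumption)
        name = file_name if isinstance(file_name, str) else file_name.name
        self.run_name = name[-(self.split_char_loc + 4):-4]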

Hi @doc_curry, welcome to the Streamlit community!

Is there a GitHub repo you can provide that shows the entirety of the code?

Best,
Randy

This topic was automatically closed 365 days after the last reply. New replies are no longer allowed.