Filter dataframe error

Hello,

I am having a hard time with a filter error. I am building an app that imports a CSV data file, does some calculations, and creates a new dataframe that contains a summary of the analysis. Then, using job_filter = st.selectbox("select", pd.unique(df['test'])), I got an error (attached).

Any advice? Many thanks in advance.

Could you post a minimal reproducible code?

Dear Fredy,

Thank you for your reply. Below is the code:

import pandas as pd # read csv, df manipulation
import plotly.express as px # interactive charts
import streamlit as st # data web application development

import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV # Alpha of Lasso
from sklearn import linear_model # Ridge Regression
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score

from statsmodels.tsa.arima.model import ARIMA

#from tabulate import tabulate
from numpy import arange

st.set_page_config(
    page_title="Real-Time Data Science Dashboard",
    page_icon="Active",
    layout="wide",
)

st.title('Macroeconomic Analysis')
st.markdown('This dashboard will help you get more information about the GDP and Reserves and plots the results')

file = st.file_uploader("Upload CSV data file", type=['csv'])

if file is not None:
    data = pd.read_csv(file)
    st.write(data)
    data['GDP'] = data['GDP'].diff()
    data.dropna(inplace=True)
    data.reset_index(inplace=True, drop=True)
    y = data['GDP']
    x = data.iloc[:, 2:]

    # Split the Sample at hand
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    Reg = LinearRegression()
    Reg.fit(x_train, y_train)

    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    #  k = 10 folds and repeat the cross-validation process 3 times.
    model = RidgeCV(alphas=arange(0.1, 10, 1), cv=cv, scoring='neg_mean_absolute_error')
    model.fit(x_train, y_train)
    model.alpha_
    Ridge = linear_model.Ridge(alpha=model.alpha_)
    Ridge.fit(x_train, y_train)

    Polynomial = Pipeline([('poly', PolynomialFeatures(degree=2)), ('linear', LinearRegression(fit_intercept=False))])
    Polynomial.fit(x_train, y_train)

    #  ----- In Sample predictions with each model & Corresponding tests
    y_pred = Reg.predict(x_test)
    Reg_r2 = r2_score(y_test, y_pred)
    Reg_MSE = mean_squared_error(y_test, y_pred)
    # mean_absolute_error(y_test, y_pred)

    y_pred = Ridge.predict(x_test)
    Ridge_r2 = r2_score(y_test, y_pred)
    Ridge_MSE = mean_squared_error(y_test, y_pred)

    model_pred = {
        "Linear Regression": {'Reg_MSE': Reg_MSE, 'Reg_r2': Reg_r2},
        "Ridge regression ": {'Ridge_MSE': Ridge_MSE, 'Ridge_r2': Ridge_r2},
        # "Polynomial regression": poly_MSE,
        # "Support Vector Machine": SVR_Prediction,
        # "Random forest regression": Forest_Prediction,
    }

    Table = [['Test', 'Regression', 'Ridge', 'Polynomial'],
             ['MSE', round(Reg_MSE, 2), round(Ridge_MSE, 2), round(Reg_MSE, 2)],
             ['R-Squared', round(Reg_r2, 2), round(Ridge_r2, 2), round(Reg_r2, 2)]]

    del data
    df = pd.DataFrame(Table)
    st.write(df)

    # Create the Filter first:
    # top-level filters
    job_filter = st.selectbox("Select the Job", pd.unique(df['Test']))
    # dataframe filter
    df = df[df['Test'] == job_filter]
    period = df['date'].iloc[-1]

    # KPI
    col1, col2, col3 = st.columns(3)

    with col1:
        st.metric(
            label = 'GDP',
            value = round (df['Ridge'].mean())     # Rounding here eliminates decimals.
        )

    with col2:
        st.metric(
            label = 'Reserves',
            value = int(df['Regression'].mean())
        )

    with col3:
        st.metric(
            label = 'FDI',
            value = f'$ {period}'
        )

    # job_filter = st.selectbox('Select the status', pd.unique(df['Polynomial']))
    # d = df[df['Polynomial'] == status_filter]

    df['date'] = pd.to_numeric(df['date'])

    # df = df.astype({'date': 'int'})

    fig_col1, fig_col2, fig_col3 = st.columns([0.45, 0.45, 0.1])
    # width and length 400 default.

    with fig_col1:
        st.markdown("First Chart")
        fig1 = px.histogram(data_frame=df, x="Ridge")
        st.write(fig1)

    with fig_col2:
        st.markdown("## Second Chart")
        fig2 = px.histogram(data_frame=df, x="Regression")
        st.write(fig2)

    with fig_col1:
        st.markdown("Graph")
        fig3 = px.line(df, x='date', y=df.columns[2:], width=1200)
        st.write(fig3)

And the data file used for uploading is attached.
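
Judging from how the code indexes the dataframe (data['GDP'], data.iloc[:, 2:] for the regressors, and a date column), the CSV presumably looks something like the sketch below; the column names after date and GDP, and all the numbers, are made up purely for illustration:

date,GDP,Reserves,FDI
2001,510.2,120.5,30.1
2002,523.8,125.9,32.4
2003,540.1,131.2,35.0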

The error message tells you exactly which line is causing an issue. One guess is that the creation of the unique list shouldn’t be done inside the instantiation of the selectbox, which might happen before all the rest of the code (i.e. df is not actually defined when the selectbox instantiation code is assembled).

You have this code:

Table = [['Test', 'Regression', 'Ridge', 'Polynomial'],
         ['MSE', round(Reg_MSE, 2), round(Ridge_MSE, 2), round(Reg_MSE, 2)],
         ['R-Squared', round(Reg_r2, 2), round(Ridge_r2, 2), round(Reg_r2, 2)]]

del data
df = pd.DataFrame(Table)
st.write(df)

It seems like ['Test', 'Regression', 'Ridge', 'Polynomial'] is supposed to be the header.

You can create a correct df by:

table_header = ['Test', 'Regression', 'Ridge', 'Polynomial']
Table = [
    ['MSE', round(Reg_MSE, 2), round(Ridge_MSE, 2), round(Reg_MSE, 2)],
    ['R-Squared', round(Reg_r2, 2), round(Ridge_r2, 2), round(Reg_r2, 2)]
]

del data
df = pd.DataFrame(Table, columns=table_header)  # Table to df
st.write(df)

And by that, the ‘Test’ column is now defined.

job_filter = st.selectbox("Select the Job", pd.unique(df['Test']))
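
For context, the reason the original df = pd.DataFrame(Table) raised the error is that, without columns=, pandas labels the columns 0, 1, 2, 3, so df['Test'] does not exist. A minimal sketch (with made-up numbers) showing the difference:

import pandas as pd

table = [['MSE', 1.23, 1.20, 1.23],
         ['R-Squared', 0.85, 0.86, 0.85]]   # made-up values, just for illustration

df_bad = pd.DataFrame(table)
print(df_bad.columns.tolist())              # [0, 1, 2, 3] -> df_bad['Test'] raises KeyError

df_ok = pd.DataFrame(table, columns=['Test', 'Regression', 'Ridge', 'Polynomial'])
print(pd.unique(df_ok['Test']))             # ['MSE' 'R-Squared']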

ferdy, I can't thank you enough. Appreciate it.

The code now runs well and produces the required results; however, it is accompanied by the following error:

The code now reads:

import pandas as pd # read csv, df manipulation
import plotly.express as px # interactive charts
import streamlit as st # data web application development

import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV # Alpha of Lasso
from sklearn import linear_model # Ridge Regression
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score

from statsmodels.tsa.arima.model import ARIMA

#from tabulate import tabulate
from numpy import arange

st.set_page_config(
    page_title="Real-Time Data Science Dashboard",
    page_icon="Active",
    layout="wide",
)

st.title('Macroeconomic Analysis')
st.markdown('This dashboard will help you get more information about the GDP and Reserves and plots the results')

file = st.file_uploader("Upload CSV data file", type=['csv'])

if file is not None:
    data = pd.read_csv(file)
    data['GDP'] = data['GDP'].diff()
    data.dropna(inplace=True)
    data.reset_index(inplace=True, drop=True)
    y = data['GDP']
    x = data.iloc[:, 2:]

    # Split the Sample at hand
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    Reg = LinearRegression()
    Reg.fit(x_train, y_train)

    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    #  k = 10 folds and repeat the cross-validation process 3 times.
    model = RidgeCV(alphas=arange(0.1, 10, 1), cv=cv, scoring='neg_mean_absolute_error')
    model.fit(x_train, y_train)
    model.alpha_
    Ridge = linear_model.Ridge(alpha=model.alpha_)
    Ridge.fit(x_train, y_train)

    Polynomial = Pipeline([('poly', PolynomialFeatures(degree=2)), ('linear', LinearRegression(fit_intercept=False))])
    Polynomial.fit(x_train, y_train)

    #  ----- In Sample predictions with each model & Corresponding tests -----
    y_pred = Reg.predict(x_test)
    Reg_r2 = r2_score(y_test, y_pred)
    Reg_MSE = mean_squared_error(y_test, y_pred)
    # mean_absolute_error(y_test, y_pred)

    y_pred = Ridge.predict(x_test)
    Ridge_r2 = r2_score(y_test, y_pred)
    Ridge_MSE = mean_squared_error(y_test, y_pred)

    model_pred = {
        "Linear Regression": {'Reg_MSE': Reg_MSE, 'Reg_r2': Reg_r2},
        "Ridge regression ": {'Ridge_MSE': Ridge_MSE, 'Ridge_r2': Ridge_r2},
        # "Polynomial regression": poly_MSE,
        # "Support Vector Machine": SVR_Prediction,
        # "Random forest regression": Forest_Prediction,
    }
    table_header = ['Test', 'Regression', 'Ridge', 'Polynomial']
    Table = [
        ['MSE', round(Reg_MSE, 2), round(Ridge_MSE, 2), round(Reg_MSE, 2)],
        ['R-Squared', round(Reg_r2, 2), round(Ridge_r2, 2), round(Reg_r2, 2)]
    ]

    del data
    df = pd.DataFrame(Table, columns=table_header)  # Table to df
    st.write(df)

    # Create the Filter first:
    # top-level filters
    job_filter = st.selectbox("Select the Job", pd.unique(df['Test']))
    # dataframe filter
    df = df[df['Test'] == job_filter]
    period = df['Ridge'].iloc[-1]

    # KPI
    col1, col2, col3 = st.columns(3)

    with col1:
        st.metric(
            label = 'GDP',
            value = round (df['Ridge'].mean())     # Rounding here eliminates decimals.
        )

    with col2:
        st.metric(
            label = 'Reserves',
            value = int(df['Regression'].mean())
        )

    with col3:
        st.metric(
            label = 'FDI',
            value = f'$ {period}'
        )

    # job_filter = st.selectbox('Select the status', pd.unique(df['Polynomial']))
    # d = df[df['Polynomial'] == status_filter]

    df['date'] = pd.to_numeric(df['date'])

    # df = df.astype({'date': 'int'})

    fig_col1, fig_col2, fig_col3 = st.columns([0.45, 0.45, 0.1])
    # width and length 400 default.

    with fig_col1:
        st.markdown("First Chart")
        fig1 = px.histogram(data_frame=df, x="Ridge")
        st.write(fig1)

    with fig_col2:
        st.markdown("## Second Chart")
        fig2 = px.histogram(data_frame=df, x="Regression")
        st.write(fig2)

    with fig_col1:
        st.markdown("Graph")
        fig3 = px.line(df, x='Ridge', y=df.columns[2:], width=1200)
        st.write(fig3)

    # Show a Table
    # st.markdown('### Detailed View')
    # st.dataframe(df)

streamlit run app+.py (in Terminal)

I cannot replicate your error message. But the following setup/code works for me without errors.

requirements.txt

streamlit
statsmodels
scikit-learn
plotly

Install requirements

pip install -r requirements.txt

app.py

import pandas as pd # read csv, df manipulation
import plotly.express as px # interactive charts
import streamlit as st # data web application development

# import pandas as pd
# import matplotlib.pyplot as plt

import sklearn
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV # Alpha of Lasso
from sklearn import linear_model # Ridge Regression
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score

from statsmodels.tsa.arima.model import ARIMA

#from tabulate import tabulate
from numpy import arange

st.set_page_config(
    page_title='Real-Time Data Science Dashboard',
    page_icon='Active',
    layout='wide',
)

st.title('Macroeconomic Analysis')
st.markdown('This dashboard will help you get more information about the GDP and Reserves and plots the results')

file=st.file_uploader('Upload CSV data file', type=['csv'])

if file is not None:
    data= pd.read_csv(file)
    data['GDP'] = data['GDP'].diff()
    data.dropna(inplace=True)
    data.reset_index(inplace=True, drop=True)
    y = data['GDP']
    x = data.iloc[:, 2:]

    # Split the Sample at hand
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    Reg = LinearRegression()
    Reg.fit(x_train, y_train)

    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    #  k = 10 folds and repeat the cross-validation process 3 times.
    model = RidgeCV(alphas=arange(0.1, 10, 1), cv=cv, scoring='neg_mean_absolute_error')
    model.fit(x_train, y_train)
    model.alpha_
    Ridge = linear_model.Ridge(alpha=model.alpha_)
    Ridge.fit(x_train, y_train)

    Polynomial = Pipeline([('poly', PolynomialFeatures(degree=2)), ('linear', LinearRegression(fit_intercept=False))])
    Polynomial.fit(x_train, y_train)

    #  ----- In Sample predictions with each model & Corresponding tests -----
    y_pred = Reg.predict(x_test)
    Reg_r2 = r2_score(y_test, y_pred)
    Reg_MSE = mean_squared_error(y_test, y_pred)
    # mean_absolute_error(y_test, y_pred)

    y_pred = Ridge.predict(x_test)
    Ridge_r2 = r2_score(y_test, y_pred)
    Ridge_MSE = mean_squared_error(y_test, y_pred)

    model_pred = {
        "Linear Regression": {'Reg_MSE': Reg_MSE, 'Reg_r2': Reg_r2},
        "Ridge regression ": {'Ridge_MSE': Ridge_MSE, 'Ridge_r2': Ridge_r2},
        # "Polynomial regression": poly_MSE,
        # "Support Vector Machine": SVR_Prediction,
        # "Random forest regression":Forest_Prediction,
    }
    table_header = ['Test', 'Regression', 'Ridge', 'Polynomial']
    Table = [
        ['MSE', round(Reg_MSE, 2), round(Ridge_MSE, 2), round(Reg_MSE, 2)],
        ['R-Squared', round(Reg_r2, 2), round(Ridge_r2, 2), round(Reg_r2, 2)]
    ]

    del data
    df = pd.DataFrame(Table, columns=table_header)  # Table to df
    st.write(df)


    # Create the Filter first:
    # top-level filters
    job_filter = st.selectbox("Select the Job", pd.unique(df['Test']))
    # dataframe filter
    df = df[df['Test'] == job_filter]
    period=df['Ridge'].iloc[-1]

    # KPI
    col1, col2, col3=st.columns(3)

    with col1:
        st.metric(
            label = 'GDP',
            value = round (df['Ridge'].mean())     # Rounding here eliminates decimals.
    )

    with col2:
        st.metric(
            label = 'Reserves',
            value = int(df['Regression'].mean())
    )

    with col3:
        st.metric(
            label = 'FDI',
            value = f'$ {period}'
    )

    fig_col1, fig_col2,fig_col3= st.columns([0.45,0.45,0.1])
    # width and length 400 default.

    with fig_col1:
        st.markdown("First Chart")
        fig1 = px.histogram(data_frame=df, x="Ridge")
        st.write(fig1)

    with fig_col2:
        st.markdown("## Second Chart")
        fig2 = px.histogram(data_frame=df, x="Regression")
        st.write(fig2)

    with fig_col1:
        st.markdown("Graph")
        fig3 = px.line(df, x='Ridge', y=df.columns[2:], width=1200)
        st.write(fig3)

run the app

streamlit run app.py

Output

Thank you again, ferdy. One last thing about the txt file and the install step:

pip install -r requirements.txt

What is it used for when deploying, given that we already have the requirements.txt for the dependencies?

Yes, the requirements.txt file is used for cloud deployment. It can also be used locally while developing the app.
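
If you want the cloud environment to match your local one exactly, you can optionally pin versions in requirements.txt; the version numbers below are only placeholders, so use whatever you have installed locally:

streamlit==1.25.0
statsmodels==0.14.0
scikit-learn==1.3.0
plotly==5.15.0

You can also generate the file from your local environment with pip freeze > requirements.txt, or just list the top-level packages as above and let pip resolve the rest.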
