Hi all. I have a Streamlit app that is running a PCA analysis, which should not be super resource intensive. My code for the PCA analysis runs in Jupyter in 0.2 seconds, but takes extremely long to run when I put the same code into the Streamlit environment. And I also get an error saying âout of memoryâ in every browser I try (I am running locally for the moment).
So my code, even though it works extremely efficiently in Jupyter, wonât run properly in my Streamlit environment.
I have copied the code below, and you can imagine the 2 csv files as basic dataframes with several variables. They are large files, so I canât attach them here.
Any help appreciated.
Code:
import hydralit as hy
from numpy.core.fromnumeric import var
import streamlit
import streamlit as st
import sys
from streamlit import cli as stcli
from PIL import Image
from functions import *
import streamlit.components.v1 as components
import pandas as pd
from st_clickable_images import clickable_images
import numpy as np
import statsmodels.api as sm
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist
import seaborn as sns
from io import BytesIO
from statsmodels.formula.api import ols
from streamlit.state.session_state import SessionState
import tkinter
import matplotlib
# matplotlib.use('TkAgg')
# matplotlib.use('Agg')
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from sklearn.tree import DecisionTreeRegressor, plot_tree
import sklearn
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold
import time
from scipy.stats import pearsonr
from scipy.stats import spearmanr
import dtale
from dtale.views import startup
from dtale.app import get_instance
import webbrowser
import dtale.global_state as global_state
import dtale.app as dtale_app
from matplotlib.pyplot import axis, hist
from scipy import stats as stats
from bioinfokit.analys import stat
from statsmodels.stats.anova import AnovaRM
import statsmodels.api as sm
from statsmodels.graphics.factorplots import interaction_plot
from sklearn.decomposition import PCA
title = '<p style="font-family:sans-serif; color:red; font-size: 39px; text-align: center;"><b>Code testing environment</b></p>'
st.markdown(title, unsafe_allow_html=True)
### From Jupyter - 0. Prepare the data
# Read data
df = pd.read_csv('allrecordsohe.csv', low_memory=False)
df2 = pd.read_csv('allrecords.csv', low_memory=False)
branddf = pd.read_csv('Brandname encoding.csv', low_memory=False)
# Check for empty data
df.isnull().sum()
df2.isnull().sum()
# Remove NaN
nr_samples_before = df.shape[0]
df = df.fillna(0)
print('Removed %s samples' % (nr_samples_before - df.shape[0]))
nr_samples_before = df2.shape[0]
df2 = df2.fillna(0)
print('Removed %s samples' % (nr_samples_before - df2.shape[0]))
# Drop irrelevant variables
df.drop(['TD_ID', 'KRUX_ID', 'TAP_IT_ID', 'GOOGLE_CLIENT_ID'], axis=1, inplace=True)
df2.drop(['TD_ID', 'KRUX_ID', 'TAP_IT_ID', 'GOOGLE_CLIENT_ID'], axis=1, inplace=True)
# df = df.reset_index()
# df2 = df2.reset_index()
### End
### Enter code to test here
with st.spinner('Please wait while we conduct principal component analysis'):
my_bar = st.progress(0)
time.sleep(1)
for percent_complete in range(100):
time.sleep(0.01)
my_bar.progress(percent_complete + 1)
start_time = time.time()
### From Jupyter - Principal component analysis
# Initially, visualize the important data features
# Scale the features
# Separating out the features
x = df.iloc[:, 1:-1].sample(10000).values #subsampling for efficiency and speed
# Separating out the target
y = df.iloc[:,0].sample(10000).values #subsampling for efficiency and speed
# Standardizing the features
x = StandardScaler().fit_transform(x)
# Dimensionality reduction
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents
, columns = ['principal component 1', 'principal component 2', 'principal component 3', 'principal component 4', 'principal component 5', 'principal component 6', 'principal component 7', 'principal component 8', 'principal component 9', 'principal component 10'])
# Concatenate DF across axis 1
finalDf = pd.concat([principalDf, df['BRAND']], axis = 1)
st.write("Table of top 10 principal components")
st.write(finalDf)
# Plot 2D data
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('PCA showing top 2 components', fontsize = 20)
targets = ['BRAND']
colors = ['r', 'g', 'b']
for target, color in zip(targets,colors):
indicesToKeep = finalDf['BRAND'] == target
ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1']
, finalDf.loc[indicesToKeep, 'principal component 2']
, c = color
, s = 50)
# ax.set_xticks([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
# ax.set_yticks([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
ax.set_xticks([-100, -10, -0.1, 0, 0.1, 1, 10, 100])
ax.set_yticks([-100, -10, -0.1, 0, 0.1, 1, 10, 100])
ax.legend(targets)
ax.grid()
buf = BytesIO()
fig.savefig(buf, format="png")
st.image(buf)
# Explain the variance
st.write("Explained variance from top 10 components:")
st.write(pca.explained_variance_ratio_)
### End
st.text("") # Spacer
st.write("")
st.write("Principal component analysis took ", time.time() - start_time, "seconds to run")
pca = st.button("Click to see how PCA can speed up machine learning and to run a new regression model")
if pca == True:
# st.session_state.pcasession = 'True'
with st.spinner('Please wait while we conduct a new linear regression using the principal components'):
my_bar = st.progress(0)
time.sleep(1)
for percent_complete in range(100):
time.sleep(0.1)
my_bar.progress(percent_complete + 1)
start_time = time.time()
### From Jupter - Principal component analysis continued
# Now use PCA to speed up machine learning
#from sklearn.model_selection import train_test_split
# test_size: what proportion of original data is used for test set
train_X, test_X, train_y, test_y = train_test_split(x, y, test_size=1/4.0, random_state=0)
# Scale the data
scaler = StandardScaler()
# Fit on training set only
scaler.fit(train_X)
# Apply transform to both the training set and the test set.
train_X = scaler.transform(train_X)
test_X = scaler.transform(test_X)
# Choose minimum number of principal components such that 95% of the variance is retained
# Make an instance of the model
pca = PCA(.95)
# Fit on training set
pca.fit(train_X)
# Apply the mapping (transformation) to both the training set and the test set
train_X = pca.transform(train_X)
test_X = pca.transform(test_X)
# Apply model of choice, e.g. logistic regression - this will become dynamic in the app; choose model here
# Determine number of components
st.write("Number of useful components:")
st.write(pca.n_components_)
# Determine components
st.write("Component contributions:")
st.write(pca.components_)
df3 = pd.DataFrame(pca.components_)
# st.table(df3)
### End
# tunedreg = st.button("Click to run a regression model with these components") # For brand
# if tunedreg == True and st.session_state.pcasession == True:
### From Jupyter - Linear regression
# Choose predicted variable - this will become dynamic in the app
y = df['BRAND'].sample(7500)
print(y.shape)
print(train_X.shape)
# Define predictor variables
x = train_X
x, y = np.array(x), np.array(y)
x = sm.add_constant(x)
model = sm.OLS(y, x)
results = model.fit()
st.subheader("PCA regression results:")
st.write(results.summary())
st.write("")
st.write('\nPredicted response:', results.fittedvalues, sep='\n') # Or print('predicted response:', results.predict(x), sep='\n')
st.write("")
st.write("Conducting a new linear regression with principal components took ", time.time() - start_time, "seconds to run")