My app works fine locally, but in the cloud it gets terminated. It starts running the load_models() code, but the process is killed after some time with the error below.
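I suspect it is running out of memory while the models load. One way to verify that locally would be a small memory probe around the heavy calls (a minimal sketch; psutil is an assumption here, it is not in my requirements.txt):

import os
import psutil

def log_memory(stage):
    # Print the resident set size of the current process in MB
    rss_mb = psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2
    print(f"[memory] {stage}: {rss_mb:.1f} MB")

Calling log_memory() right before and after load_data() should show how close the app gets to the container's limit.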
app.py
import pickle
import streamlit as st
import requests
import numpy as np
from os import path
import model_bert
import model_tfidf
st.set_page_config(layout="wide")
st.markdown(
    """
    <style>
    /* Hide the scrollbar but keep scrolling functionality */
    ::-webkit-scrollbar {
        width: 0px;
        background: transparent; /* Make the scrollbar transparent */
    }
    </style>
    """,
    unsafe_allow_html=True  # Allow HTML tags in Markdown
)
def load_models(model):
    if path.exists(f"./data/similarity_{model}.pkl"):
        with open(f"./data/similarity_{model}.pkl", "rb") as f:
            return pickle.load(f)
    with st.spinner('Wait for models to train...'):
        model_tfidf.get_model()
        model_bert.get_model()
    # The pickle exists now, so this recursion terminates after one pass
    return load_models(model)
# Load data using st.cache_data to prevent reloading on every run
@st.cache_data(show_spinner=True)
def load_data():
    return {
        'movies': pickle.load(open(r'./data/movie_list.pkl', 'rb')),
        'similarity_tfidf': load_models("tfidf"),
        'similarity_bert': load_models("bert"),
    }
st.sidebar.title('Team 5')
# Load data
data = load_data()
movies = data['movies']
if 'watched_movies' not in st.session_state:
    st.session_state.watched_movies = []
if 'summed_matrix_histories' not in st.session_state:
    st.session_state.summed_matrix_histories = np.zeros(movies.shape[0])
def recommend(movie, use_history):
    if embed_type == 'TF-IDF':
        similarity = data['similarity_tfidf']
    else:
        similarity = data['similarity_bert']
    index = movies[movies['title'] == movie].index[0]
    if use_history:
        st.session_state.watched_movies.append(index)
        st.session_state.summed_matrix_histories = st.session_state.summed_matrix_histories + similarity[index]
        final_matrix = st.session_state.summed_matrix_histories
    else:
        final_matrix = similarity[index]
    distances = sorted(enumerate(final_matrix), reverse=True, key=lambda x: x[1])
    recommended_movie_ids = []
    count = 0
    for idx, score in distances[1:]:
        if idx not in st.session_state.watched_movies:
            recommended_movie_ids.append(idx)
            count = count + 1
            if count == 5:
                break
    return recommended_movie_ids
def display_selection_page():
    st.header('Movie Recommender System - Selection')
    global embed_type
    embed_type = st.sidebar.selectbox(
        'Embedding type:',
        ['TF-IDF', 'BERT']
    )
    use_history = st.checkbox("Use multiple histories")
    movie_list = movies['title'].values
    selected_movie = st.selectbox(
        "Type or select a movie from the dropdown",
        movie_list
    )
    if st.button('Show Recommendation'):
        recommended_movie_ids = recommend(selected_movie, use_history)
        display_recommendations(recommended_movie_ids)
    if st.session_state.watched_movies:
        display_watched_movies()
# Display watched movies and reset button
def display_watched_movies():
    st.sidebar.write("Watched movies:")
    for i in st.session_state.watched_movies:
        st.sidebar.write(movies['title'][i])
    if st.sidebar.button("Reset"):
        st.session_state.watched_movies = []
        st.session_state.summed_matrix_histories = np.zeros(movies.shape[0])
# Display movie recommendations
def display_recommendations(recommended_movie_ids):
    columns = st.columns(5)
    for i, index in enumerate(recommended_movie_ids):
        movie = movies.iloc[index]
        if not movie.empty:
            title = movie['title']
            director = ', '.join(movie['director']) if movie['director'] else "-"
            cast = ', '.join(movie['cast']) if movie['cast'] else "-"
            genre = ', '.join(movie['listed_in']) if movie['listed_in'] else "-"
            country = ', '.join(movie['country']) if movie['country'] else "-"
            release = movie['release_year'] if movie['release_year'] else "-"
            # Display each movie in a separate column
            with columns[i]:
                st.text(capitalize_sentence(title))
                try:
                    st.image(get_image_from_tmdb(title), use_column_width=True)
                except Exception as e:
                    print("Failed to load image")
                    print(f"Error: {e}")
                    st.image('./data/images/empty.jpg', use_column_width=True)
                st.write("**Country:**", capitalize_sentence(country))
                st.write("**Genre:**", capitalize_sentence(genre))
        else:
            st.write(f"Movie '{index}' not found in the dataset.")
def get_image_from_tmdb(movie_name):
    headers = {
        "accept": "application/json",
        "Authorization": "Bearer ey"
    }
    # Pass the title via params so requests URL-encodes it
    url = "https://api.themoviedb.org/3/search/movie"
    response = requests.get(url, headers=headers, params={"query": movie_name})
    #print(response.json())
    return f'https://image.tmdb.org/t/p/w185{response.json()["results"][0]["poster_path"]}'
def capitalize_sentence(string):
    # Split the string into words
    words = string.split(' ')
    # Capitalize the first letter of each word
    capitalized_words = [word.capitalize() for word in words]
    # Join the capitalized words back into a single string
    return ' '.join(capitalized_words)
# Display content based on the selected page
def display_prompt_page():
    st.header("Movie Recommender System - Prompt")
    movie_prompt = st.text_area("Describe your ideal movie", value="", height=200)
    if st.button('Show Recommendation'):
        with st.spinner("Generating recommendation..."):
            generate_recommendation(movie_prompt)
# Generate recommendation using GPT model and display embedding
def generate_recommendation(movie_prompt):
    response = requests.post("http://localhost:5000/embed", json={"prompt": movie_prompt})
    recommended_movies = response.json()["recommended_movie_ids"]
    display_recommendations(recommended_movies)
# Main function to display selected page
def main():
    global embed_type
    page = st.sidebar.selectbox(
        "Method type",
        ["Selection", "Prompt"]
    )
    if page == "Selection":
        display_selection_page()
    elif page == "Prompt":
        display_prompt_page()
# Run the app
if __name__ == "__main__":
    main()
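One change I am considering (not reflected above) is loading each similarity matrix exactly once per server process with st.cache_resource, which returns the cached object itself instead of copying it on each access the way st.cache_data does; a minimal sketch under that assumption:

import pickle
import streamlit as st

@st.cache_resource(show_spinner=True)
def get_similarity(model):
    # Load the pickled similarity matrix once and share it across sessions
    with open(f"./data/similarity_{model}.pkl", "rb") as f:
        return pickle.load(f)

recommend() could then call get_similarity('tfidf') or get_similarity('bert') instead of keeping both matrices in the load_data() dictionary at once.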
Error:
[23:09:26] 📦 Processed dependencies!
[nltk_data] Downloading package punkt to /home/appuser/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data] /home/appuser/nltk_data...[2024-06-10 23:09:48.504200]
[nltk_data] Unzipping corpora/stopwords.zip.
[23:13:00] ❗️ The service has encountered an error while checking the health of the Streamlit app: Get "http://localhost:8501/healthz": read tcp 10.12.168.103:43902->10.12.168.103:8501: read: connection reset by peer
/app/scripts/run-streamlit.sh: line 9: 207 Killed sudo -E -u appuser /home/adminuser/venv/bin/streamlit "$@"
[23:14:32] ❗️ Streamlit server consistently failed status checks
[23:14:32] ❗️ Please fix the errors, push an update to the git repo, or reboot the app.
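As far as I can tell, the plain "Killed" line (rather than a Python traceback) means the process was terminated from outside, which usually points to the platform's memory limit rather than an exception in my code. To estimate how much the pickled artifacts alone weigh, a quick local check (assuming the files in ./data have already been generated):

from os import path

for name in ("movie_list", "similarity_tfidf", "similarity_bert"):
    file = f"./data/{name}.pkl"
    if path.exists(file):
        print(f"{file}: {path.getsize(file) / 1024 ** 2:.1f} MB")

The in-memory footprint will be at least this large, on top of the SentenceTransformer model that model_bert.get_model() downloads.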
requirements.txt
numpy==1.26.4
Requests==2.32.2
sentence_transformers==2.7.0
nltk==3.8.1
setuptools==58.1.0
streamlit==1.35.0
model_bert.py
import pandas as pd
import numpy as np
import ast
import pickle
import torch
from collections import Counter
from sentence_transformers import SentenceTransformer
def get_model():
    # Read the CSV files
    history_df = pd.read_csv('./data/netflix_history_preprocessed.csv')
    titles_df = pd.read_csv('./data/netflix_titles_preprocessed.csv')
    # Convert string representations of lists to actual lists
    titles_df['director'] = titles_df['director'].apply(ast.literal_eval)
    titles_df['cast'] = titles_df['cast'].apply(ast.literal_eval)
    titles_df['country'] = titles_df['country'].apply(ast.literal_eval)
    titles_df['listed_in'] = titles_df['listed_in'].apply(ast.literal_eval)
    # Keep only the first occurrence of each title
    titles_df = titles_df.drop_duplicates(subset=['title'], keep='first').reset_index(drop=True)
    history_titles_set = set(history_df['Title'])
    titles_set = set(titles_df['title'])
    overlaps = history_titles_set.intersection(titles_set)
    en_history_df = history_df[history_df['Title'].isin(overlaps)]
    watch_history = en_history_df['Title'].to_list()
    # Flatten the list of actor names
    actor_names = [name for sublist in titles_df['cast'] for name in sublist]
    # Count the occurrences of each actor name
    name_counts = Counter(actor_names)

    def keep_top_three_actors(actor_list):
        if len(actor_list) == 0:
            return []
        # Keep only the three most frequent actors
        actor_list.sort(key=lambda x: name_counts[x], reverse=True)
        return actor_list[:3]

    titles_df['cast'] = titles_df['cast'].apply(keep_top_three_actors)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)
    descriptions = titles_df['description'].to_list()
    descriptions_embeddings = model.encode(descriptions, convert_to_tensor=True)
    descriptions_similarity_scores = torch.matmul(descriptions_embeddings, descriptions_embeddings.T).cpu().numpy()

    # Evaluation of the recommendation
    def evaluate(similarity_scores, consider_history=False):
        target_ranks = []
        scores = np.zeros(similarity_scores.shape[0])
        for i in range(1, len(watch_history)):
            target_title = watch_history[i]
            target_row_index = titles_df.index[titles_df['title'] == target_title].tolist()[0]
            prev_title = watch_history[i - 1]
            prev_row_index = titles_df.index[titles_df['title'] == prev_title].tolist()[0]
            # Get recommendation based on the similarity
            if consider_history:
                scores = 1 / 2 * scores + 1 / 2 * similarity_scores[prev_row_index]
            else:
                scores = similarity_scores[prev_row_index]
            recommendation_indices = sorted(range(len(scores)), key=lambda j: scores[j], reverse=True)
            target_rank = recommendation_indices.index(target_row_index)
            target_ranks.append(target_rank)
        print('Average rank:', np.mean(target_ranks))
        print('Successful recommendations:', np.sum(np.array(target_ranks) <= 5))

    metadata = []
    for index, row in titles_df.iterrows():
        text = ''
        if row['director']:
            text += f"the director is {','.join(row['director'])}. "
        if row['cast']:
            text += f"the leading actors are {','.join(row['cast'])}. "
        if row['country']:
            text += f"the movie is from {','.join(row['country'])}. "
        if row['release_year']:
            text += f"the movie is released in {row['release_year']}. "
        if row['listed_in']:
            text += f"the movie falls within the genre of {','.join(row['listed_in'])}. "
        metadata.append(text)
    metadata_embeddings = model.encode(metadata, convert_to_tensor=True)
    metadata_similarity_scores = torch.matmul(metadata_embeddings, metadata_embeddings.T).cpu().numpy()
    pickle.dump(titles_df, open('./data/movie_list.pkl', 'wb'))
    pickle.dump(descriptions_similarity_scores + metadata_similarity_scores, open('./data/similarity_bert.pkl', 'wb'))
    pickle.dump(descriptions_embeddings.cpu().numpy(), open('./data/descriptions_embeddings.pkl', 'wb'))
    pickle.dump(metadata_embeddings.cpu().numpy(), open('./data/metadata_embeddings.pkl', 'wb'))

if __name__ == "__main__":
    get_model()
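Since similarity_bert.pkl stores a dense N x N float matrix, its size grows quadratically with the number of titles. If memory turns out to be the problem, one option (not what the code above does) would be to keep only the k most similar titles per row; a hedged sketch with NumPy, where top_k_neighbours is a hypothetical helper:

import numpy as np

def top_k_neighbours(similarity, k=50):
    # argpartition finds the k largest columns per row without a full sort
    idx = np.argpartition(-similarity, k, axis=1)[:, :k]
    scores = np.take_along_axis(similarity, idx, axis=1)
    return idx.astype(np.int32), scores.astype(np.float32)

Pickling (idx, scores) instead of the full matrix cuts storage from O(N^2) to O(N * k).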
model_tfidf.py
import pandas as pd
import numpy as np
import nltk
import string
import ast
import pickle
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def get_model():
    nltk.download('punkt')
    nltk.download('stopwords')
    # Read the CSV files
    history_df = pd.read_csv('./data/netflix_history_preprocessed.csv')
    titles_df = pd.read_csv('./data/netflix_titles_preprocessed.csv')
    # Convert string representations of lists to actual lists
    titles_df['director'] = titles_df['director'].apply(ast.literal_eval)
    titles_df['cast'] = titles_df['cast'].apply(ast.literal_eval)
    titles_df['country'] = titles_df['country'].apply(ast.literal_eval)
    titles_df['listed_in'] = titles_df['listed_in'].apply(ast.literal_eval)
    # Keep only the first occurrence of each title
    titles_df = titles_df.drop_duplicates(subset=['title'], keep='first').reset_index(drop=True)
    history_titles_set = set(history_df['Title'])
    titles_set = set(titles_df['title'])
    overlaps = history_titles_set.intersection(titles_set)
    en_history_df = history_df[history_df['Title'].isin(overlaps)]
    watch_history = en_history_df['Title'].to_list()

    def preprocess_text(text):
        # Tokenization
        tokens = nltk.tokenize.word_tokenize(text.lower())
        # Remove punctuation
        tokens = [token for token in tokens if token not in string.punctuation]
        # Remove stop words
        stop_words = set(nltk.corpus.stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]
        # Stemming
        stemmer = nltk.stem.PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
        return ' '.join(tokens)

    titles_df['description'] = titles_df['description'].apply(preprocess_text)

    def preprocess_name(name_list):
        # Remove spaces within each name so a full name becomes one token
        return [name.replace(' ', '') for name in name_list]

    titles_df['director'] = titles_df['director'].apply(preprocess_name)
    titles_df['cast'] = titles_df['cast'].apply(preprocess_name)
    # Flatten the list of actor names
    actor_names = [name for sublist in titles_df['cast'] for name in sublist]
    # Count the occurrences of each actor name
    name_counts = Counter(actor_names)

    def keep_top_three_actors(actor_list):
        if len(actor_list) == 0:
            return []
        # Keep only the three most frequent actors
        actor_list.sort(key=lambda x: name_counts[x], reverse=True)
        return actor_list[:3]

    titles_df['cast'] = titles_df['cast'].apply(keep_top_three_actors)
    # Calculate TF-IDF vectors for the processed descriptions
    tfidf_vectorizer = TfidfVectorizer()
    titles_tfidf = tfidf_vectorizer.fit_transform(titles_df['description'])
    # Calculate cosine similarity
    similarity_scores = cosine_similarity(titles_tfidf, titles_tfidf)

    # Check if two lists have overlapping elements
    def have_overlap(list1, list2):
        return bool(set(list1) & set(list2))

    def create_overlap_matrix(column_name):
        matrix_size = len(titles_df)
        overlap_matrix = np.zeros((matrix_size, matrix_size), dtype=int)
        column = titles_df[column_name].to_list()
        for i in range(matrix_size):
            for j in range(matrix_size):
                if have_overlap(column[i], column[j]):
                    overlap_matrix[i, j] = 1
        return overlap_matrix

    overlap_director = create_overlap_matrix('director')
    overlap_cast = create_overlap_matrix('cast')
    overlap_country = create_overlap_matrix('country')
    overlap_genre = create_overlap_matrix('listed_in')

    # Evaluation of the recommendation
    def evaluate(x1, x2, x3, x4, x5, consider_history=False):
        target_ranks = []
        combined_scores = x1 * similarity_scores + x2 * overlap_director + x3 * overlap_cast + x4 * overlap_country + x5 * overlap_genre
        scores = np.zeros(combined_scores.shape[0])
        for i in range(1, len(watch_history)):
            target_title = watch_history[i]
            target_row_index = titles_df.index[titles_df['title'] == target_title].tolist()[0]
            prev_title = watch_history[i - 1]
            prev_row_index = titles_df.index[titles_df['title'] == prev_title].tolist()[0]
            # Get recommendation based on the similarity
            if consider_history:
                scores += combined_scores[prev_row_index]
            else:
                scores = combined_scores[prev_row_index]
            recommendation_indices = sorted(range(len(scores)), key=lambda j: scores[j], reverse=True)
            target_rank = recommendation_indices.index(target_row_index)
            target_ranks.append(target_rank)
        print('Average rank:', np.mean(target_ranks))
        print('Successful recommendations:', np.sum(np.array(target_ranks) <= 5))

    combined_scores = 50 * similarity_scores + 1 * overlap_director + 2 * overlap_cast + 0.5 * overlap_country + 2 * overlap_genre
    combined_scores = np.array(combined_scores, dtype=np.float32)
    pickle.dump(combined_scores, open('./data/similarity_tfidf.pkl', 'wb'))

if __name__ == "__main__":
    get_model()
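A side note on create_overlap_matrix(): the Python double loop performs O(N^2) set intersections, which is slow and briefly holds several large intermediates. The same 0/1 matrix can be computed with scikit-learn's MultiLabelBinarizer and one sparse product (a sketch, assuming the same list-valued columns):

from sklearn.preprocessing import MultiLabelBinarizer

def create_overlap_matrix_fast(titles_df, column_name):
    # One-hot encode each title's label list as a sparse matrix
    one_hot = MultiLabelBinarizer(sparse_output=True).fit_transform(titles_df[column_name])
    # Two titles overlap iff they share at least one label
    return (one_hot @ one_hot.T > 0).astype(int).toarray()

scikit-learn is already pulled in for TfidfVectorizer, so this adds no new dependency.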