Streamlit server consistently failed status checks. Model training takes too long

Locally my app works fine, but in the cloud it terminates. The app starts running the load_models() code, but after some time the platform kills the process and reports the error shown below.

app.py

import pickle
import streamlit as st
import requests
import numpy as np
from os import path

import model_bert
import model_tfidf

# Page config must be the first Streamlit command executed in the script.
st.set_page_config(layout="wide")

# Inject CSS to visually hide the scrollbar (WebKit browsers only).
st.markdown(
    """
    <style>
    /* Hide the scrollbar but keep scrolling functionality */
    ::-webkit-scrollbar {
        width: 0px;
        background: transparent; /* Make the scrollbar transparent */
    }
    </style>
    """,
    unsafe_allow_html=True  # Allow HTML tags in Markdown
)
    
def load_models(model):
    """Return the pickled similarity matrix for *model* ("tfidf" or "bert").

    If the pickle is missing, train and persist both models first, then
    load the freshly written file. Returns None for unknown model names.
    """
    pkl_path = f"./data/similarity_{model}.pkl"
    if path.exists(pkl_path):
        # Context manager closes the handle (the previous bare open() leaked it).
        with open(pkl_path, 'rb') as f:
            return pickle.load(f)
    else:
        with st.spinner('Wait for models to train...'):
            model_tfidf.get_model()
            model_bert.get_model()
        # BUG FIX: the recursive calls' results were previously discarded,
        # so this branch always returned None; propagate the loaded matrix.
        if model == "tfidf": return load_models("tfidf")
        elif model == "bert": return load_models("bert")

# Initialize models if needed (trains and pickles them on first run).
# NOTE(review): this runs at import time on every cold start. When the
# pickles are absent, BOTH models are trained (including BERT encoding of
# the whole catalogue) before the server can answer /healthz — given the
# log ("Killed" + failed status checks), this startup work is the likely
# reason the cloud platform terminates the app. Consider pre-computing the
# .pkl files offline and shipping/downloading them instead of training here.
load_models("tfidf")
load_models("bert")

# Load data using st.cache_data to prevent reloading on every run
@st.cache_data(show_spinner=True)
def load_data():
    """Load the movie metadata frame and both similarity matrices.

    Cached by Streamlit, so the pickles are read once per server process
    rather than on every script rerun.
    """
    # Context manager closes the file handle (the previous open() leaked it).
    with open('./data/movie_list.pkl', 'rb') as f:
        movies = pickle.load(f)
    return {
        'movies': movies,
        'similarity_tfidf': load_models("tfidf"),
        'similarity_bert': load_models("bert"),
    }

st.sidebar.title('Team 5')

# Load data once per process (see load_data's st.cache_data decorator).
data = load_data()
movies = data['movies']

# Per-session state: indices of watched movies, and the running sum of
# their similarity rows (used when "Use multiple histories" is checked).
if 'watched_movies' not in st.session_state:
    st.session_state.watched_movies = []
if 'summed_matrix_histories' not in st.session_state:
    st.session_state.summed_matrix_histories = np.zeros(movies.shape[0])

def recommend(movie, use_history):
    """Return up to five movie indices ranked most similar to *movie*.

    Uses the similarity matrix selected by the global `embed_type`. When
    *use_history* is set, the chosen movie is recorded as watched and the
    ranking is based on the accumulated similarity of all watched movies;
    already-watched titles are excluded from the result.
    """
    if embed_type == 'TF-IDF':
        scores_matrix = data['similarity_tfidf']
    else:
        scores_matrix = data['similarity_bert']

    movie_idx = movies[movies['title'] == movie].index[0]

    if use_history:
        st.session_state.watched_movies.append(movie_idx)
        st.session_state.summed_matrix_histories = (
            st.session_state.summed_matrix_histories + scores_matrix[movie_idx]
        )
        scores = st.session_state.summed_matrix_histories
    else:
        scores = scores_matrix[movie_idx]

    # Rank all titles by score, descending; [1:] drops the top entry
    # (normally the selected movie itself).
    ranked = sorted(enumerate(scores), reverse=True, key=lambda pair: pair[1])

    recommendations = []
    for candidate, _score in ranked[1:]:
        if candidate in st.session_state.watched_movies:
            continue
        recommendations.append(candidate)
        if len(recommendations) == 5:
            break

    return recommendations
        

def display_selection_page():
    """Render the dropdown-based recommendation page and its controls."""
    st.header('Movie Recommender System - Selection')

    # `embed_type` is read by recommend(); kept global because Streamlit
    # reruns the whole script on every interaction.
    global embed_type
    embed_type = st.sidebar.selectbox(
        'Embedding type:',
        ['TF-IDF', 'BERT']
    )

    use_history = st.checkbox("Use multiple histories")

    movie_list = movies['title'].values
    selected_movie = st.selectbox(
        "Type or select a movie from the dropdown",
        movie_list 
    )

    if st.button('Show Recommendation'):
        recommended_movie_ids = recommend(selected_movie, use_history)
        display_recommendations(recommended_movie_ids)

    # Sidebar list of watched titles (shown only once history exists).
    if st.session_state.watched_movies:
        display_watched_movies()

# Display watched movies and reset button
def display_watched_movies():
    """List watched titles in the sidebar with a Reset button that
    clears the session's watch history and accumulated similarities."""
    st.sidebar.write("Watched movies:")
    for i in st.session_state.watched_movies:
        st.sidebar.write(movies['title'][i])

    if st.sidebar.button("Reset"):
        st.session_state.watched_movies = []
        st.session_state.summed_matrix_histories = np.zeros(movies.shape[0])

# Display movie recommendations
def display_recommendations(recommended_movie_ids):
    """Render up to five recommended movies side by side, each with its
    TMDB poster (placeholder image on failure), country and genre."""
    # NOTE(review): assumes len(recommended_movie_ids) <= 5; a sixth id
    # would raise IndexError on columns[i] below.
    columns = st.columns(5)

    for i, index in enumerate(recommended_movie_ids):
        movie = movies.iloc[index]
        # NOTE(review): `movie` is a pandas Series (one row); .empty is
        # False whenever the frame has columns, so the else branch below
        # looks effectively unreachable — confirm intent.
        if not movie.empty:
            title = movie['title']
            # List-valued columns (parsed by the model scripts) are joined
            # for display; "-" when empty.
            director = ', '.join(movie['director']) if movie['director'] else "-"
            cast = ', '.join(movie['cast']) if movie['cast'] else "-"
            genre = ', '.join(movie['listed_in']) if movie['listed_in'] else "-"
            country = ', '.join(movie['country']) if movie['country'] else "-"
            release = movie['release_year'] if movie['release_year'] else "-"
            # NOTE(review): director, cast and release are computed but
            # never rendered below.

            # Display each movie in a separate column
            with columns[i]:
                st.text(capitalize_sentence(title))
                try:
                    st.image(get_image_from_tmdb(title), use_column_width=True)
                except Exception as e:
                    print("Failed to load image")
                    print(f"Error: {e}")    
                    st.image('./data/images/empty.jpg', use_column_width=True)
                st.write("**Country:**", capitalize_sentence(country))
                st.write("**Genre:**", capitalize_sentence(genre))
        else:
            st.write(f"Movie '{index}' not found in the dataset.")


def get_image_from_tmdb(movie_name):
    """Return a TMDB poster image URL for *movie_name*.

    Raises on HTTP errors, timeouts, or when the search yields no results;
    the caller (display_recommendations) catches any exception and falls
    back to a placeholder image.
    """
    headers = {
        "accept": "application/json",
        # NOTE(review): placeholder token — inject the real TMDB bearer
        # token via st.secrets or an environment variable, not source code.
        "Authorization": f"Bearer ey"
    }
    url = "https://api.themoviedb.org/3/search/movie"
    # `params` URL-encodes the title (spaces, unicode); the timeout keeps a
    # slow TMDB response from hanging the Streamlit script run.
    response = requests.get(url, headers=headers,
                            params={"query": movie_name}, timeout=10)
    response.raise_for_status()
    results = response.json()["results"]
    if not results:
        raise ValueError(f"No TMDB results for {movie_name!r}")
    return f'https://image.tmdb.org/t/p/w185{results[0]["poster_path"]}'


def capitalize_sentence(string):
    """Capitalize the first letter of every space-separated word.

    Remaining letters of each word are lowercased (str.capitalize
    semantics). Splitting on single spaces means runs of consecutive
    spaces are preserved in the output.
    """
    return ' '.join(word.capitalize() for word in string.split(' '))

# Display content based on the selected page
def display_prompt_page():
    """Render the free-text prompt page: the user describes a movie and
    recommendations come from an external embedding service."""
    st.header("Movie Recommender System - Prompt")

    movie_prompt = st.text_area("Describe your ideal movie", value="", height=200)

    if st.button('Show Recommendation'):
        with st.spinner("Generating recommendation..."):
            generate_recommendation(movie_prompt)

# Generate recommendation using GPT model and display embedding
def generate_recommendation(movie_prompt):
    """POST the free-text prompt to the local embedding service and render
    the movies it recommends.

    NOTE(review): http://localhost:5000 is only reachable when the
    embedding service runs alongside the app — it will not exist on
    Streamlit Cloud; confirm the deployment topology.
    """
    response = requests.post(
        "http://localhost:5000/embed",
        json={"prompt": movie_prompt},
        timeout=30,  # avoid hanging the script run if the service is down
    )
    # Fail loudly on HTTP errors instead of parsing an error body as JSON.
    response.raise_for_status()
    recommended_movies = response.json()["recommended_movie_ids"]
    display_recommendations(recommended_movies)

# Main function: route to the page chosen in the sidebar.
def main():
    """Dispatch to the selection or prompt page per the sidebar choice."""
    global embed_type

    pages = {
        "Selection": display_selection_page,
        "Prompt": display_prompt_page,
    }
    page = st.sidebar.selectbox(
        "Method type",
        list(pages)
    )

    handler = pages.get(page)
    if handler is not None:
        handler()

# Run the app only when executed as a script (not when imported).
if __name__ == "__main__":
    main()

Error:

[23:09:26] 📦 Processed dependencies!




[nltk_data] Downloading package punkt to /home/appuser/nltk_data...

[nltk_data]   Unzipping tokenizers/punkt.zip.

[nltk_data] Downloading package stopwords to

[nltk_data]     /home/appuser/nltk_data...[2024-06-10 23:09:48.504200] 

[nltk_data]   Unzipping corpora/stopwords.zip.

[23:13:00] ❗️ The service has encountered an error while checking the health of the Streamlit app: Get "http://localhost:8501/healthz": read tcp 10.12.168.103:43902->10.12.168.103:8501: read: connection reset by peer

/app/scripts/run-streamlit.sh: line 9:   207 Killed                  sudo -E -u appuser /home/adminuser/venv/bin/streamlit "$@"

[23:14:32] ❗️ Streamlit server consistently failed status checks

[23:14:32] ❗️ Please fix the errors, push an update to the git repo, or reboot the app.

requirements.txt

numpy==1.26.4
Requests==2.32.2
sentence_transformers==2.7.0
nltk==3.8.1
setuptools==58.1.0
streamlit==1.35.0

model_bert.py

import pandas as pd
import numpy as np
import ast
import pickle
import torch
from collections import Counter
from sentence_transformers import SentenceTransformer

def get_model():
    """Train the BERT-based recommender artefacts and pickle them.

    Encodes every title's description and a synthesized metadata sentence
    with a SentenceTransformer, builds dot-product similarity matrices,
    and writes movie_list.pkl, similarity_bert.pkl and the raw embeddings
    to ./data.

    NOTE(review): downloading the transformer and encoding the whole
    catalogue is memory/CPU heavy — running this inside the Streamlit app
    at startup is the likely cause of the "Killed" in the deployment log;
    consider building these artefacts offline.
    """
    # Read the CSV files
    history_df = pd.read_csv('./data/netflix_history_preprocessed.csv')
    titles_df = pd.read_csv('./data/netflix_titles_preprocessed.csv')

    # Convert string representation of list to actual list
    titles_df['director'] = titles_df['director'].apply(ast.literal_eval)
    titles_df['cast'] = titles_df['cast'].apply(ast.literal_eval)
    titles_df['country'] = titles_df['country'].apply(ast.literal_eval)
    titles_df['listed_in'] = titles_df['listed_in'].apply(ast.literal_eval)

    # Keep only the first occurrence of each title
    titles_df = titles_df.drop_duplicates(subset=['title'], keep='first').reset_index(drop=True)

    # Restrict the watch history to titles present in the catalogue.
    history_titles_set = set(history_df['Title'])
    titles_set = set(titles_df['title'])
    overlaps = history_titles_set.intersection(titles_set)
    en_history_df = history_df[history_df['Title'].isin(overlaps)]
    watch_history = en_history_df['Title'].to_list()

    # Flatten the list of actor names
    actor_names = [name for sublist in titles_df['cast'] for name in sublist]

    # Count the occurrences of each actor name
    name_counts = Counter(actor_names)

    def keep_top_three_actors(actor_list):
        # Reduce each cast list to its three globally most frequent actors.
        if len(actor_list) == 0:
            return []
        # Keep only the top k most frequent actors
        actor_list.sort(key=lambda x: name_counts[x], reverse=True)
        return actor_list[:3]

    titles_df['cast'] = titles_df['cast'].apply(keep_top_three_actors)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)
    descriptions = titles_df['description'].to_list()
    descriptions_embeddings = model.encode(descriptions, convert_to_tensor=True)
    # Dot-product similarity between all description pairs; this equals
    # cosine similarity only if the embeddings are normalized — TODO
    # confirm for this model/encode configuration.
    descriptions_similarity_scores = torch.matmul(descriptions_embeddings, descriptions_embeddings.T).cpu().numpy()
    # Evaluation of the recommendation

    # NOTE(review): evaluate() is defined but never called in this module —
    # apparently kept from experimentation (dead code in production).
    def evaluate(similarity_scores, consider_history=False):
        """Print the average rank of each actually-watched next title under
        the recommender, optionally blending in a running history."""
        target_ranks = []
        scores = np.zeros(similarity_scores.shape[0])
        
        for i in range(1, len(watch_history)):
            target_title = watch_history[i]
            target_row_index = titles_df.index[titles_df['title'] == target_title].tolist()[0]
            prev_title = watch_history[i - 1]
            prev_row_index = titles_df.index[titles_df['title'] == prev_title].tolist()[0]
        
            # Get recommendation based on the similarity
            if consider_history:
                scores = 1 / 2 * scores + 1 / 2 * similarity_scores[prev_row_index]
            else:
                scores = similarity_scores[prev_row_index]
            recommendation_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
            target_rank = recommendation_indices.index(target_row_index)
            target_ranks.append(target_rank)
        
        print('Average rank:', np.mean(target_ranks))
        print('Successful recommendations:', np.sum(np.array(target_ranks) <= 5))

    # Build one natural-language "metadata sentence" per title from the
    # structured columns, then embed those sentences too.
    metadata = []

    for index, row in titles_df.iterrows():
        text = ''
        if row['director']:
            text += f"the director is {','.join(row['director'])}. "
        if row['cast']:
            text += f"the leading actors are {','.join(row['cast'])}. "
        if row['country']:
            text += f"the movie is from {','.join(row['country'])}. "
        if row['release_year']:
            text += f"the movie is released in {row['release_year']}. "
        if row['listed_in']:
            text += f"the movie falls within the genre of {','.join(row['listed_in'])}. "
        
        metadata.append(text)
        
    metadata_embeddings = model.encode(metadata, convert_to_tensor=True)
    metadata_similarity_scores = torch.matmul(metadata_embeddings, metadata_embeddings.T).cpu().numpy()

    # Persist artefacts consumed by app.py (movie_list.pkl is read by
    # load_data; similarity_bert.pkl by load_models("bert")).
    pickle.dump(titles_df, open('./data/movie_list.pkl', 'wb'))
    pickle.dump(descriptions_similarity_scores + metadata_similarity_scores, open('./data/similarity_bert.pkl', 'wb'))
    pickle.dump(descriptions_embeddings.cpu().numpy(), open('./data/descriptions_embeddings.pkl', 'wb'))
    pickle.dump(metadata_embeddings.cpu().numpy(), open('./data/metadata_embeddings.pkl', 'wb'))

# Allow building the artefacts offline with `python model_bert.py`.
if __name__ == "__main__":
    get_model()

model_tfidf.py

import pandas as pd
import numpy as np
import nltk
import string
import ast
import pickle
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_model():
    """Train the TF-IDF-based recommender artefacts and pickle them.

    Computes cosine similarity over TF-IDF vectors of the preprocessed
    descriptions, adds binary overlap matrices for director/cast/country/
    genre, and writes the weighted combination to
    ./data/similarity_tfidf.pkl (consumed by app.py's load_models).
    """
    # NLTK data is fetched at runtime — needs network access on a fresh
    # machine (visible in the deployment log).
    nltk.download('punkt')
    nltk.download('stopwords')
    # Read the CSV files
    history_df = pd.read_csv('./data/netflix_history_preprocessed.csv')
    titles_df = pd.read_csv('./data/netflix_titles_preprocessed.csv')

    # Convert string representation of list to actual list
    titles_df['director'] = titles_df['director'].apply(ast.literal_eval)
    titles_df['cast'] = titles_df['cast'].apply(ast.literal_eval)
    titles_df['country'] = titles_df['country'].apply(ast.literal_eval)
    titles_df['listed_in'] = titles_df['listed_in'].apply(ast.literal_eval)

    # Keep only the first occurrence of each title
    titles_df = titles_df.drop_duplicates(subset=['title'], keep='first').reset_index(drop=True)

    history_titles_set = set(history_df['Title'])
    titles_set = set(titles_df['title'])

    # Restrict the watch history to titles present in the catalogue.
    overlaps = history_titles_set.intersection(titles_set)
    en_history_df = history_df[history_df['Title'].isin(overlaps)]
    watch_history = en_history_df['Title'].to_list()

    def preprocess_text(text):
        """Lowercase, tokenize, strip punctuation and stop words, stem."""
        # Tokenization
        tokens = nltk.tokenize.word_tokenize(text.lower())

        # Remove punctuation
        tokens = [token for token in tokens if token not in string.punctuation]

        # Remove stop words
        stop_words = set(nltk.corpus.stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]

        # Stemming
        stemmer = nltk.stem.PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]

        return ' '.join(tokens)

    titles_df['description'] = titles_df['description'].apply(preprocess_text)

    def preprocess_name(name_list):
        # Remove spaces between each name ("Tom Hanks" -> "TomHanks") so a
        # full name stays a single TF-IDF token.
        return [name.replace(' ', '') for name in name_list]

    titles_df['director'] = titles_df['director'].apply(preprocess_name)
    titles_df['cast'] = titles_df['cast'].apply(preprocess_name)

    # Flatten the list of actor names
    actor_names = [name for sublist in titles_df['cast'] for name in sublist]

    # Count the occurrences of each actor name
    name_counts = Counter(actor_names)

    def keep_top_three_actors(actor_list):
        # Reduce each cast list to its three globally most frequent actors.
        if len(actor_list) == 0:
            return []
        # Keep only the top k most frequent actors
        actor_list.sort(key=lambda x: name_counts[x], reverse=True)
        return actor_list[:3]

    titles_df['cast'] = titles_df['cast'].apply(keep_top_three_actors)

    # Calculate TF-IDF vectors for processed titles and descriptions
    tfidf_vectorizer = TfidfVectorizer()
    titles_tfidf = tfidf_vectorizer.fit_transform(titles_df['description'])

    # Calculate cosine similarity
    similarity_scores = cosine_similarity(titles_tfidf, titles_tfidf)

    # Function to check if two lists have overlapping elements
    def have_overlap(list1, list2):
        return bool(set(list1) & set(list2))

    def create_overlap_matrix(column_name):
        """Binary n x n matrix: 1 where two titles share any value of
        *column_name* (director, cast, country or genre)."""
        # NOTE(review): O(n^2) Python loop over all pairs; the matrix is
        # symmetric, so at least half the work is redundant — acceptable
        # offline, slow for large catalogues.
        matrix_size = len(titles_df)
        overlap_matrix = np.zeros((matrix_size, matrix_size), dtype=int)

        column = titles_df[column_name].to_list()
        for i in range(matrix_size):
            for j in range(matrix_size):
                if have_overlap(column[i], column[j]):
                    overlap_matrix[i, j] = 1

        return overlap_matrix

    overlap_director = create_overlap_matrix('director')
    overlap_cast = create_overlap_matrix('cast')
    overlap_country = create_overlap_matrix('country')
    overlap_genre = create_overlap_matrix('listed_in')

    # Evaluation of the recommendation
    # NOTE(review): evaluate() is defined but never called in this module —
    # apparently kept from experimentation (dead code in production).
    def evaluate(x1, x2, x3, x4, x5, consider_history=False):
        """Print the average rank of each actually-watched next title under
        the recommender for the given component weights x1..x5."""
        target_ranks = []
        combined_scores = x1 * similarity_scores + x2 * overlap_director + x3 * overlap_cast + x4 * overlap_country + x5 * overlap_genre
        scores = np.zeros(combined_scores.shape[0])

        for i in range(1, len(watch_history)):
            target_title = watch_history[i]
            target_row_index = titles_df.index[titles_df['title'] == target_title].tolist()[0]
            prev_title = watch_history[i - 1]
            prev_row_index = titles_df.index[titles_df['title'] == prev_title].tolist()[0]

            # Get recommendation based on the similarity
            if consider_history:
                scores += combined_scores[prev_row_index]
            else:
                scores = combined_scores[prev_row_index]
            recommendation_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
            target_rank = recommendation_indices.index(target_row_index)
            target_ranks.append(target_rank)

        print('Average rank:', np.mean(target_ranks))
        print('Successful recommendations:', np.sum(np.array(target_ranks) <= 5))
        

    # Hand-tuned weights for the final similarity combination.
    combined_scores = 50 * similarity_scores + 1 * overlap_director + 2 * overlap_cast + 0.5 * overlap_country + 2 * overlap_genre
    combined_scores = np.array(combined_scores, dtype=np.float32)

    # Persist the matrix consumed by app.py's load_models("tfidf").
    pickle.dump(combined_scores, open('./data/similarity_tfidf.pkl', 'wb'))

# Allow building the artefacts offline with `python model_tfidf.py`.
if __name__ == "__main__":
    get_model()