After every browser refresh, memory increases in Kubernetes

Summary

I have a dashboard deployed in Kubernetes; the application connects to Azure Blob Storage using pyarrow (fsspec). Memory stabilizes eventually, but every browser refresh pushes it up a little before it settles again, and the memory never goes back down after the computation finishes.

Steps to reproduce

Code snippet:

import datetime
import gc

import altair as alt
import fsspec
import polars as pl
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import streamlit as st


@st.cache_resource
def get_fs():
    # Cache the abfs filesystem handle so it is created once per process.
    return fsspec.filesystem('abfs'), 'abfs://'


loc_data_fs, loc_data_appender = get_fs()


@st.cache_resource(ttl=6 * 3600)
def get_dataset(file_path):
    # Cached resource: the dataset handle is shared across all sessions and
    # kept in memory until the 6-hour TTL expires.
    connection_to_dataset = ds.dataset(file_path, filesystem=loc_data_fs, partitioning='hive')
    return connection_to_dataset

@st.cache_resource(ttl=6 * 3600)
def load_rgs() -> pl.DataFrame:
    path = RESOURCE_GROUP_PATH

    # Renamed from `ds` to avoid shadowing the pyarrow.dataset import.
    dataset = get_dataset(path)
    df = pl.scan_pyarrow_dataset(dataset).select(pl.col(["B", "C"])).collect()
    return df


def main():
    # Static assets: these cached loaders (defined elsewhere in the app)
    # only hit storage once per TTL.
    subs = load_subs()
    rgs = load_rgs()
    mcateg = load_metercategories()

    # dynamic selections begin
    col1, col2, col3, col4, col5 = st.columns([2, 2, 1, 1, 1])
    dt_end = get_max_date()

    with col1:
        sel_a = st.multiselect("B", subs)

    with col2:
        r = rgs.filter(pl.col("B").is_in(sel_a)).select(
            pl.col("C")).drop_nulls().to_pandas()

        sel_b = st.multiselect("C", r)

    with col3:
        d_start = st.date_input("StartDate",
                                dt_end - datetime.timedelta(30))  # e.g. datetime.date(2023, 1, 1)

    with col4:
        d_end = st.date_input("EndDate", dt_end)  # e.g. datetime.date(2023, 1, 1)

    with col5:
        sel_mc = st.multiselect("Meter Category", mcateg.to_pandas())


    filters = []
    
    filters.append(('A', '>=', d_start.strftime('%Y-%m-%d')))
    filters.append(('A', '<=', d_end.strftime('%Y-%m-%d')))

    if sel_a:
        filters.append(('B', 'in', sel_a))

    if sel_b:
        filters.append(('C', 'in', sel_b))
        # df = df.filter(pl.col("C").is_in(sel_b))

    if sel_mc:
        filters.append(('E', 'in', sel_mc))
        # df = df.filter(pl.col("E").is_in(sel_mc))

    # pq.read_table pushes `filters` down to the dataset scan, pruning hive
    # partitions and row groups so only matching data is read from storage.
    df = pl.from_pandas(
        pq.read_table(METRICS_PATH,
                      partitioning='hive',
                      filters=filters,
                      filesystem=loc_data_fs,
                      columns=[
                          "A", "B", "C", "D",
                          "E", "F"
                      ]).to_pandas())

    col1, col2, col3, col4, col5 = st.columns(5)

    with col1:
        st.metric("Subs", len(sel_a))
    with col2:
        if not sel_b:
            # Fall back to all resource groups when none are selected.
            sel_b = rgs.to_pandas()['C'].to_list()
        st.metric("Resource Groups", len(sel_b))
    with col3:
        st.metric("Rec", df.select(pl.count("*"))[0, 0])
    with col4:
        st.metric("Spend",
                  round(df.select(pl.col("F")).sum()[0, 0], 3))
    with col5:
        st.metric("Data Size (mb)", round(df.estimated_size("mb")))

    # charts
    df_sub = df.groupby(["B",
                         "A"]).agg(pl.col("F").sum())
    sub_chart = alt.Chart(df_sub.to_pandas()).mark_bar().encode(
        x='A',
        y='F',
        color='B',
        tooltip=['B', 'F', 'A']).interactive()

    st.altair_chart(sub_chart, use_container_width=True)

    dfr = df.groupby(["A", "C"]).agg(pl.col("F").sum())
    reschart = alt.Chart(dfr.to_pandas()).mark_bar().encode(
        x='A',
        y='F',
        color='C',
        tooltip=['C', 'F', 'A'])

    st.altair_chart(reschart, use_container_width=True)

    df_prod = df.groupby(["A", "D"]).agg(pl.col("F").sum())

    prod_chart = alt.Chart(df_prod.to_pandas()).mark_bar().encode(
        x='A',
        y='F',
        color='D',
        tooltip=['D', 'F', 'A'])

    st.altair_chart(prod_chart, use_container_width=True)
    df_mcat = df.groupby(["A",
                          "E"]).agg(pl.col("F").sum())

    mcat_chart = alt.Chart(df_mcat.to_pandas()).mark_bar().encode(
        x='A',
        y='F',
        color='E',
        tooltip=['E', 'F', 'A'])

    st.altair_chart(mcat_chart, use_container_width=True)

    # Explicitly drop references so the gc.collect() in __main__ can reclaim
    # them. Note: even after collection, the allocator does not necessarily
    # return freed pages to the OS, so the process RSS can stay elevated.
    del df
    del filters

    del prod_chart
    del df_prod

    del df_sub
    del sub_chart

    del reschart
    del dfr

    del mcat_chart
    del df_mcat


if __name__ == '__main__':
    print('before:', len(gc.get_objects()))
    main()
    gc.collect()
    print('after:', len(gc.get_objects()))

These are only snippets, not the complete code, but they should illustrate the setup.
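To go beyond counting objects with gc.get_objects(), one option is to diff tracemalloc snapshots between refreshes to see which allocation sites are growing. A minimal sketch (not part of the app above; the Streamlit wiring here is just an assumption):

import tracemalloc

import streamlit as st


@st.cache_resource
def start_tracing():
    # Start tracemalloc once per process and keep a baseline snapshot.
    tracemalloc.start(25)  # keep up to 25 frames per allocation
    return tracemalloc.take_snapshot()


baseline = start_tracing()

if st.button("Show memory growth since first run"):
    current = tracemalloc.take_snapshot()
    # Show the ten allocation sites that grew the most since the baseline.
    for stat in current.compare_to(baseline, "lineno")[:10]:
        st.text(str(stat))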

Expected behavior:

Memory should stabilize and stay flat, as it does locally: running inside a Docker container on macOS, memory always settles near 750 MiB once the app is idle.

Actual behavior:

Running inside the Kubernetes cluster, memory grows from 750 MiB to 900 MiB after a few browser refreshes, and after some more querying it eventually settles around 1000 MiB even after a while of no use. The object count printed by gc.get_objects() also keeps increasing.
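One way to watch the per-refresh growth from inside the pod (rather than via kubectl top) is to report the process RSS on every rerun. A small sketch, assuming psutil is installed in the image:

import os

import psutil
import streamlit as st

# Resident set size of this Streamlit process, reported on every rerun.
rss_mib = psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2
st.sidebar.metric("Process RSS (MiB)", round(rss_mib))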

Debug info

  • Streamlit version: 1.25.0
  • Python version: 3.9
  • OS version: linux
  • Browser version: chrome

Hi @Sanket_Gupta

Have you tried profiling your Streamlit app? There's a Python library for this called streamlit-profiler.

Please see the forum post introducing the library, and the PyPI page: streamlit-profiler · PyPI
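Usage is roughly as shown on the PyPI page: wrap the app code in the profiler's context manager and the results are rendered in the app itself (a sketch; check the package docs for details):

import streamlit as st
from streamlit_profiler import Profiler

# Everything executed inside the context manager is profiled.
with Profiler():
    st.title("My app")
    # ... rest of the dashboard code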


Hello @dataprofessor,

We haven't tried it yet, but thank you for the suggestion. We will definitely try to get results soon and update the forum for further discussion.

Hello @Sanket_Gupta,

Did you find a solution for this? I'm seeing the same issue when deploying in Kubernetes.