Summary
I have a dashboard deployed in Kubernetes; the application connects to Azure Blob Storage using pyarrow (via fsspec). The memory eventually stabilizes, but with every browser refresh it increases a little before settling again, and it never goes back down after the computation finishes.
Steps to reproduce
Code snippet:
import datetime
import gc

import altair as alt
import fsspec
import polars as pl
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import streamlit as st
@st.cache_resource
def get_fs():
    """Create the shared abfs filesystem handle and its URI prefix.

    Cached with ``st.cache_resource`` so every Streamlit rerun reuses one
    fsspec filesystem instance instead of opening a new connection per
    refresh (repeated instantiation is a likely source of memory growth).

    Returns:
        tuple: (fsspec filesystem for the ``abfs`` protocol, the ``'abfs://'``
        URI prefix used to build paths).
    """
    # Fix: the original line read "return return fsspec.filesystem(...)",
    # which is a SyntaxError.
    return fsspec.filesystem('abfs'), 'abfs://'
loc_data_fs, loc_data_appender = get_fs()
@st.cache_resource(ttl=6 * 3600)
def get_dataset(file_path):
    """Open a hive-partitioned pyarrow dataset at *file_path*.

    Uses the module-level ``loc_data_fs`` filesystem; the handle is cached
    for six hours via ``st.cache_resource``.
    """
    return ds.dataset(file_path, filesystem=loc_data_fs, partitioning='hive')
@st.cache_resource(ttl=6 * 3600)
def load_rgs() -> pl.DataFrame:
    """Load the resource-group lookup table (columns "B" and "C").

    Materializes the pyarrow dataset at ``RESOURCE_GROUP_PATH`` into a
    polars DataFrame; cached for six hours via ``st.cache_resource``.

    Returns:
        pl.DataFrame: the two-column ("B", "C") lookup table.
    """
    # Fix: the local was named `ds`, shadowing the `pyarrow.dataset as ds`
    # module import used elsewhere in this file.
    dataset = get_dataset(RESOURCE_GROUP_PATH)
    return pl.scan_pyarrow_dataset(dataset).select(pl.col(["B", "C"])).collect()
def main():
    """Render the dashboard: selectors, filtered metrics load, KPIs, charts.

    Reads cached lookup tables, builds parquet filters from the user's
    selections, loads the filtered metrics table, then renders a KPI row
    and four daily-sum bar charts (one per grouping column).
    """
    subs = load_subs()
    # Static lookup tables: their loaders are cached, so these are only
    # fetched once per TTL, not on every rerun.
    rgs = load_rgs()
    mcateg = load_metercategories()

    # --- dynamic selections -------------------------------------------------
    col1, col2, col3, col4, col5 = st.columns([2, 2, 1, 1, 1])
    dt_end = get_max_date()
    with col1:
        sel_a = st.multiselect("B", subs)
    with col2:
        # Fix: the original referenced an undefined name `sel_subs` here
        # (NameError at runtime); the subscription selection is `sel_a`.
        r = (rgs.filter(pl.col("B").is_in(sel_a))
                .select(pl.col("C"))
                .drop_nulls()
                .to_pandas())
        sel_b = st.multiselect("C", r)
    with col3:
        # Fix: renamed from `ds`, which shadowed the `pyarrow.dataset as ds`
        # module import. Default start date: 30 days before the max date.
        dt_start = st.date_input("StartDate", dt_end - datetime.timedelta(30))
    with col4:
        de = st.date_input("EndDate", dt_end)
    with col5:
        sel_mc = st.multiselect("Meter Category", mcateg.to_pandas())

    # Build parquet row filters; the date range is always bounded.
    filters = [
        ('A', '>=', dt_start.strftime('%Y-%m-%d')),
        ('A', '<=', de.strftime('%Y-%m-%d')),
    ]
    # Fix: the original tested `if not sel_a`, which appended an *empty*
    # 'in' filter when nothing was selected (matching no rows) and skipped
    # the filter entirely when something was selected.
    if sel_a:
        filters.append(('B', 'in', sel_a))
    if sel_b:
        filters.append(('C', 'in', sel_b))
    if sel_mc:
        filters.append(('E', 'in', sel_mc))

    # Load only the needed columns, pushing the filters down to parquet.
    df = pl.from_pandas(
        pq.read_table(METRICS_PATH,
                      partitioning='hive',
                      filters=filters,
                      filesystem=loc_data_fs,
                      columns=["A", "B", "C", "D", "E", "F"]).to_pandas())

    # --- KPI row ------------------------------------------------------------
    col1, col2, col3, col4, col5 = st.columns(5)
    with col1:
        st.metric("Subs", len(sel_a))
    with col2:
        # Fix: the original computed an unused `sel_rgs` list here when
        # `sel_b` was empty (dead code); only the count is displayed.
        st.metric("Resource Groups", len(sel_b))
    with col3:
        st.metric("Rec", df.height)  # row count; simpler than select/count
    with col4:
        st.metric("Spend", round(df.select(pl.col("F")).sum()[0, 0], 3))
    with col5:
        st.metric("Data Size (mb)", round(df.estimated_size("mb")))

    # --- charts -------------------------------------------------------------
    def _daily_sum_chart(group_col, interactive=False):
        # Sum of "F" per day ("A"), one bar series per value of `group_col`.
        grouped = df.groupby(["A", group_col]).agg(pl.col("F").sum())
        chart = alt.Chart(grouped.to_pandas()).mark_bar().encode(
            x='A',
            y='F',
            color=group_col,
            tooltip=[group_col, 'F', 'A'])
        return chart.interactive() if interactive else chart

    st.altair_chart(_daily_sum_chart("B", interactive=True),
                    use_container_width=True)
    st.altair_chart(_daily_sum_chart("C"), use_container_width=True)
    st.altair_chart(_daily_sum_chart("D"), use_container_width=True)
    st.altair_chart(_daily_sum_chart("E"), use_container_width=True)
    # NOTE: the trailing `del` statements in the original were no-ops for
    # memory purposes — all these locals are released when the function
    # returns; growth must come from caches or the filesystem connection.
if __name__ == '__main__':
    # Log the live-object count around one run to watch for growth
    # across refreshes; collect first so 'after' reflects only
    # still-reachable objects.
    print('before:', len(gc.get_objects()))
    main()
    gc.collect()
    print('after:', len(gc.get_objects()))
These are only code snippets, not the complete application, but they should be enough to illustrate the issue.
Expected behavior:
When I run it locally in Macos inside docker container, the memory always gets stable near 750 mib after no use, but running inside kubernetes cluster, after some web browser refreshes, the memory increases from 750 mib to 900 mib, then after some more querying the memory gets stable at around 1000 mib after a while of no use, the number of objects printed by gc.get_objects() also increases.
Actual behavior:
Explain the undesired behavior or error you see when you run the code above.
If you're seeing an error message, share the full contents of the error message here.
Debug info
- Streamlit version: 1.25.0
- Python version: 3.9
- OS version: linux
- Browser version: chrome