) content.
- JavaScript execution
  - Executes a JS snippet to extract the visible text content directly from the DOM nodes of stDataFrame.
  - Scrolls to the bottom and back to the top to make sure lazy loading is triggered.
Even after using these methods I am not able to extract the full st.dataframe contents. I get at most 18-19 rows while the df has 90 rows, presumably because the grid only renders the rows that are currently in view.
I also tried to automate the download button click (the button created by st.download_button); a rough sketch of that attempt follows below. Selenium can click the button, but:
- It triggers a browser-level file storage permission prompt (even with headless mode or browser flags).
- This interrupts automation, making it unsuitable for cron or background jobs.
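For context, this is roughly what the download-button attempt looks like (a minimal sketch rather than my exact code; the stDownloadButton selector, download path, and prefs are assumptions):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

options = Options()
options.add_argument("--headless=new")
# prefs that are supposed to silence the save dialog; in my runs the permission prompt still appeared
options.add_experimental_option("prefs", {
    "download.default_directory": "/tmp/streamlit_downloads",  # assumed path
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
})

driver = webdriver.Chrome(options=options)
try:
    driver.get("http://localhost:8501")
    # st.download_button's wrapper appears to carry this test id (assumed selector)
    button = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "[data-testid='stDownloadButton'] button"))
    )
    button.click()   # the CSV should land in the download directory
    time.sleep(5)    # crude wait for the download to finish
finally:
    driver.quit()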
I’d appreciate any help from the community on:
- More robust ways to extract all rows from a st.dataframe.
- Tips on how others have handled scraping Streamlit apps (especially st.dataframe) at regular intervals when no API access is available.
I've attached the code for main.py and sel_scrap.py.
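(The scraper assumes the app is already running locally, e.g. started with streamlit run main.py, and reachable at http://localhost:8501.)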
main.py:
import streamlit as st
import pandas as pd
import numpy as np
st.set_page_config(page_title="Random Data Table", layout="wide")
st.title("Random Data Table")
st.write("A table with 3 columns and 90 rows of random data")

np.random.seed(42)
data = {
    'Name': [f"Person_{i+1}" for i in range(90)],
    'Age': np.random.randint(18, 80, 90),
    'Score': np.round(np.random.uniform(0, 100, 90), 2)
}
df = pd.DataFrame(data)

st.dataframe(df, use_container_width=True, height=600)

st.subheader("Basic Statistics")
col1, col2, col3 = st.columns(3)
with col1:
    st.metric("Total Rows", len(df))
with col2:
    st.metric("Average Age", f"{df['Age'].mean():.1f}")
with col3:
    st.metric("Average Score", f"{df['Score'].mean():.2f}")

csv = df.to_csv(index=False)
st.download_button(
    label="Download CSV",
    data=csv,
    file_name="random_data.csv",
    mime="text/csv"
)
sel_scrap.py:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import pandas as pd
import time
import re
def scrape_streamlit_table():
    chrome_options = Options()
    # chrome_options.add_argument("--headless")  # Uncomment to run headless
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-web-security")
    chrome_options.add_argument("--disable-features=VizDisplayCompositor")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    driver = webdriver.Chrome(options=chrome_options)
try:
url = "http://localhost:8501"
print(f"Navigating to {url}...")
driver.get(url)
wait = WebDriverWait(driver, 20)
print("Waiting for Streamlit to fully load...")
time.sleep(5)
dataframe_found = False
all_data = []
headers = []
        # Method 1: handle the virtualized Streamlit dataframe with scrolling
try:
print("Method 1: Looking for Streamlit df with virtualization handling")
dataframe_element = wait.until(
EC.presence_of_element_located((By.CSS_SELECTOR, "[data-testid='stDataFrame']"))
)
time.sleep(3)
scrollable_container = None
try:
                # try different selectors for the scrollable area
scrollable_selectors = [
".ag-body-viewport",
".ag-center-cols-viewport",
"[class*='viewport']",
"[class*='scroll']",
".ag-body-horizontal-scroll-viewport",
".ag-body-vertical-scroll-viewport"
]
for selector in scrollable_selectors:
try:
scrollable_container = dataframe_element.find_element(By.CSS_SELECTOR, selector)
print(f"Found scrollable container with selector: {selector}")
break
except:
continue
if not scrollable_container:
scrollable_container = dataframe_element
print("Using dataframe element as scrollable container")
except Exception as e:
print(f"Could not find specific scrollable container: {e}")
scrollable_container = dataframe_element
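            # helper: read whatever rows the grid has rendered at this moment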
def extract_current_data():
try:
rows = dataframe_element.find_elements(By.CSS_SELECTOR, "[role='row']")
current_data = []
current_headers = []
if rows:
#get headers from first row
first_row = rows[0]
header_cells = first_row.find_elements(By.CSS_SELECTOR, "[role='columnheader'], [role='gridcell']")
current_headers = [cell.get_attribute('innerText') or cell.text for cell in header_cells if (cell.get_attribute('innerText') or cell.text).strip()]
for row in rows[1:]:
cells = row.find_elements(By.CSS_SELECTOR, "[role='gridcell']")
row_data = [cell.get_attribute('innerText') or cell.text for cell in cells]
if any(cell.strip() for cell in row_data):
current_data.append(row_data)
return current_headers, current_data
except Exception as e:
print(f"Error extracting current data: {e}")
return [], []
headers, initial_data = extract_current_data()
print(f"Initial extraction - Headers: {headers}, Rows: {len(initial_data)}")
if not headers:
raise Exception("Could not extract headers")
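            # de-duplicate rows, since overlapping scroll positions report the same rows again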
seen_rows = set()
all_data = []
for row in initial_data:
row_key = tuple(row)
if row_key not in seen_rows:
seen_rows.add(row_key)
all_data.append(row)
print(f"Starting with {len(all_data)} unique rows")
            # scroll through the table to load all data
last_count = 0
no_change_count = 0
max_scrolls = 200
scroll_count = 0
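            # keep scrolling until no new rows show up for several consecutive iterations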
while scroll_count < max_scrolls:
scroll_count += 1
#scroll down in the container
try:
driver.execute_script("arguments[0].scrollTop += 500;", scrollable_container)
time.sleep(0.5)
actions = ActionChains(driver)
actions.move_to_element(scrollable_container).perform()
actions.send_keys(Keys.PAGE_DOWN).perform()
time.sleep(0.3)
except Exception as e:
print(f"Scrolling method failed: {e}")
driver.execute_script("window.scrollBy(0, 300);")
time.sleep(0.5)
_, current_data = extract_current_data()
new_rows_added = 0
for row in current_data:
row_key = tuple(row)
if row_key not in seen_rows:
seen_rows.add(row_key)
all_data.append(row)
new_rows_added += 1
print(f"Scroll {scroll_count}: Found {len(current_data)} current rows, Added {new_rows_added} new rows, Total unique: {len(all_data)}")
if len(all_data) == last_count:
no_change_count += 1
if no_change_count >= 5: #stop if no new data for 5 consecutive scrolls
print("No new data found for 5 consecutive scrolls, stopping...")
break
else:
no_change_count = 0
last_count = len(all_data)
                # every 10 scrolls, jump to the bottom and back to the top to make sure nothing was missed
if scroll_count % 10 == 0:
try:
driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight;", scrollable_container)
time.sleep(1)
driver.execute_script("arguments[0].scrollTop = 0;", scrollable_container)
time.sleep(1)
except:
pass
if all_data:
dataframe_found = True
print(f"Successfully extracted {len(all_data)} rows")
except Exception as e:
print(f"Method 1 failed: {e}")
import traceback
traceback.print_exc()
        # Method 2: look for a plain HTML table
if not dataframe_found:
try:
print("Method 2:looking for HTML table...")
tables = driver.find_elements(By.TAG_NAME, "table")
if tables:
table = tables[0]
header_elements = table.find_elements(By.CSS_SELECTOR, "th")
if header_elements:
headers = [th.text.strip() for th in header_elements if th.text.strip()]
row_elements = table.find_elements(By.CSS_SELECTOR, "tbody tr")
if not row_elements:
row_elements = table.find_elements(By.CSS_SELECTOR, "tr")[1:] # Skip header
for row in row_elements:
cells = row.find_elements(By.CSS_SELECTOR, "td")
row_data = [cell.text.strip() for cell in cells]
if any(cell for cell in row_data):
all_data.append(row_data)
if all_data:
dataframe_found = True
except Exception as e:
print(f"Fallback Method 2 failed: {e}")
        # Method 3: JavaScript extraction with scrolling
if not dataframe_found:
try:
print("Fallback Method 3: JavaScript extraction with scrolling...")
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
driver.execute_script("window.scrollTo(0, 0);")
time.sleep(2)
#execute js to find all table data
script = """
var result = {headers: [], data: []};
var dataframes = document.querySelectorAll('[data-testid="stDataFrame"]');
if (dataframes.length > 0) {
var df = dataframes[0];
//get all row elements
var rows = df.querySelectorAll('[role="row"]');
if (rows.length > 0) {
//get headers
var headerCells = rows[0].querySelectorAll('[role="columnheader"], [role="gridcell"]');
for (var i = 0; i < headerCells.length; i++) {
var headerText = headerCells[i].innerText || headerCells[i].textContent || '';
if (headerText.trim()) {
result.headers.push(headerText.trim());
}
}
//get data rows
for (var i = 1; i < rows.length; i++) {
var cells = rows[i].querySelectorAll('[role="gridcell"]');
var rowData = [];
for (var j = 0; j < cells.length; j++) {
var cellText = cells[j].innerText || cells[j].textContent || '';
rowData.push(cellText.trim());
}
if (rowData.some(cell => cell !== '')) {
result.data.push(rowData);
}
}
}
}
return result;
"""
js_result = driver.execute_script(script)
if js_result and js_result['headers'] and js_result['data']:
headers = js_result['headers']
all_data = js_result['data']
dataframe_found = True
print(f"JavaScript extraction found {len(all_data)} rows")
except Exception as e:
print(f"Method 3 failed: {e}")
if not dataframe_found:
print("Page source preview:")
print(driver.page_source[:2000])
return None
print(f"Extracted data:")
print(f"Headers: {headers}")
print(f"Data rows: {len(all_data)}")
if all_data and headers:
max_cols = len(headers)
cleaned_rows = []
for row in all_data:
#ensure row has same number of columns as headers
while len(row) < max_cols:
row.append('')
cleaned_rows.append(row[:max_cols])
df = pd.DataFrame(cleaned_rows, columns=headers)
#clean numeric columns
for col in df.columns:
if col.lower() in ['age', 'score']:
try:
df[col] = pd.to_numeric(df[col])
except (ValueError, TypeError):
pass #keep as string if conversion fails
else:
print("No valid data")
return None
csv_filename = "scraped_streamlit_data.csv"
df.to_csv(csv_filename, index=False)
print(f"Data saved to {csv_filename}")
print(f"\nDataFrame shape: {df.shape}")
print(f"Total rows extracted: {len(df)}")
return df
except Exception as e:
print(f"An error occurred: {str(e)}")
import traceback
traceback.print_exc()
return None
finally:
driver.quit()
print("Browser closed.")
if __name__ == "__main__":
    print("Starting")
scraped_data = scrape_streamlit_table()
if scraped_data is not None:
print("Scraping completed")
print(f"Final result: {len(scraped_data)} rows extracted")
else:
print("Scraping failed")