Automating Streamlit dataframe scraping using Selenium – Need help with a reliable extraction method

Hi Streamlit community,
I’m working on a project where I need to automate the extraction of data from a Streamlit app every hour. Unfortunately, I do not have access to the API or any backend source of the data — my only option is to interact with the frontend where a st.dataframe is rendered.
My objective is to scrape the data every hour and save it to a CSV, as part of a scheduled job (e.g., cron). I’ve attempted to automate this through the browser using Selenium.
I have replicated the issue using these 2 files:

  1. main.py: This is my local Streamlit app which generates and displays the dataframe.
  2. sel_scrap.py: A Selenium-based script I've developed to extract the data displayed in the app.

The Selenium scraper includes three strategies for scraping the Streamlit dataframe:

  1. Virtualized st.dataframe handling (AG-Grid-style selectors); a condensed sketch of the core loop follows this list.
     - Locates the data container using data-testid="stDataFrame".
     - Tries several known selectors (.ag-body-viewport, .ag-center-cols-viewport, etc.) to find the scrollable viewport of the virtualized content.
     - Extracts header and row data using role="row", role="gridcell", and role="columnheader".
     - Uses a scroll-offset increment plus PAGE_DOWN keystrokes to trigger loading of more rows.
     - Deduplicates rows and keeps scrolling until no new rows are found.

  2. HTML table detection
     - If the grid isn't detected, it searches for standard HTML `<table>` tags.
     - Extracts header (`<th>`) and body (`<tbody>`) content.

  3. JavaScript execution
     - Executes a JS snippet to directly extract visible text content from the DOM nodes of the stDataFrame element.
     - Scrolls to the bottom and back to the top to ensure lazy loading is triggered.
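In condensed form, the core of strategy 1 (scroll, extract, deduplicate, repeat) looks like this. It is a simplified excerpt of the attached sel_scrap.py and assumes `driver` is already pointing at the app:

```python
from selenium.webdriver.common.by import By
import time

# Simplified from sel_scrap.py below: `driver` is assumed to be an
# already-initialised webdriver.Chrome with the Streamlit app loaded.
grid = driver.find_element(By.CSS_SELECTOR, "[data-testid='stDataFrame']")
seen, all_rows = set(), []

for _ in range(200):  # capped number of scroll attempts
    # collect whatever rows are currently rendered, skipping the header row
    for row in grid.find_elements(By.CSS_SELECTOR, "[role='row']")[1:]:
        cells = [c.text for c in row.find_elements(By.CSS_SELECTOR, "[role='gridcell']")]
        if cells and tuple(cells) not in seen:
            seen.add(tuple(cells))
            all_rows.append(cells)
    # scroll the grid container and give the frontend time to render more rows
    driver.execute_script("arguments[0].scrollTop += 500;", grid)
    time.sleep(0.5)
# (the full script also stops early after 5 scrolls with no new rows)
```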

Even after using these methods, I am not able to extract the full st.dataframe contents: I get at most 18-19 rows, while the dataframe has 90 rows.

I also tried automating a click on the download button rendered by st.download_button. Selenium can click the button, but:

- It triggers a browser-level file-save permission prompt (even with headless mode or browser flags).
- The prompt interrupts the automation, making this approach unsuitable for cron or background jobs.
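For the download route specifically, my assumption is that the usual fix is to pre-set Chrome's download preferences so the file lands in a known folder without any prompt, roughly as sketched below (the directory and exact prefs are my guesses, not something I have working). If anyone has made this reliable with st.download_button in headless mode, that would also solve my problem:

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Sketch only: pre-seed Chrome download prefs so that clicking the
# st.download_button saves the CSV without a prompt. The path and the
# exact pref set are assumptions on my part.
opts = Options()
opts.add_argument("--headless=new")
opts.add_experimental_option("prefs", {
    "download.default_directory": "/tmp/streamlit_downloads",  # hypothetical path
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
})
driver = webdriver.Chrome(options=opts)
driver.get("http://localhost:8501")
# ...locate and click the download button, then poll the folder for the file
```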

I’d appreciate any help from the community on:

  1. More robust ways to extract all rows from a st.dataframe.
  2. Tips on how others have handled scraping Streamlit apps (especially st.dataframe) at regular intervals when no API access is available.

I've attached the code for main.py and sel_scrap.py below.

main.py:

import streamlit as st
import pandas as pd
import numpy as np

st.set_page_config(page_title="Random Data Table", layout="wide")
st.title("Random Data Table")
st.write("A table with 3 columns and 90 rows of random data")

np.random.seed(42)
data = {
    'Name': [f"Person_{i+1}" for i in range(90)],
    'Age': np.random.randint(18, 80, 90),
    'Score': np.round(np.random.uniform(0, 100, 90), 2)
}

df = pd.DataFrame(data)
st.dataframe(df, use_container_width=True, height=600)

st.subheader("Basic Statistics")
col1, col2, col3 = st.columns(3)

with col1:
    st.metric("Total Rows", len(df))

with col2:
    st.metric("Average Age", f"{df['Age'].mean():.1f}")

with col3:
    st.metric("Average Score", f"{df['Score'].mean():.2f}")

csv = df.to_csv(index=False)
st.download_button(
    label="Download CSV",
    data=csv,
    file_name="random_data.csv",
    mime="text/csv"
)

sel_scrap.py:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import pandas as pd
import time
import re

def scrape_streamlit_table():
  chrome_options = Options()
  # chrome_options.add_argument("--headless")  # Uncomment to run headless
  chrome_options.add_argument("--no-sandbox")
  chrome_options.add_argument("--disable-dev-shm-usage")
  chrome_options.add_argument("--disable-web-security")
  chrome_options.add_argument("--disable-features=VizDisplayCompositor")
  chrome_options.add_argument("--disable-blink-features=AutomationControlled")
  chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
  chrome_options.add_experimental_option("useAutomationExtension", False)

  driver = webdriver.Chrome(options=chrome_options)

  try:

    url = "http://localhost:8501"
    print(f"Navigating to {url}...")
    driver.get(url)
    
    wait = WebDriverWait(driver, 20)
    
    print("Waiting for Streamlit to fully load...")
    time.sleep(5)
    
    
    dataframe_found = False
    all_data = []
    headers = []
    
    #Method 1: Handle virtualized Streamlit dataframe with scrolling
    try:
        print("Method 1: Looking for Streamlit df with virtualization handling")
        dataframe_element = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "[data-testid='stDataFrame']"))
        )
        
        time.sleep(3)
        
    
        scrollable_container = None
        try:
            #try diff selectors for scrollable area
            scrollable_selectors = [
                ".ag-body-viewport",
                ".ag-center-cols-viewport", 
                "[class*='viewport']",
                "[class*='scroll']",
                ".ag-body-horizontal-scroll-viewport",
                ".ag-body-vertical-scroll-viewport"
            ]
            
            for selector in scrollable_selectors:
                try:
                    scrollable_container = dataframe_element.find_element(By.CSS_SELECTOR, selector)
                    print(f"Found scrollable container with selector: {selector}")
                    break
                except:
                    continue
                    
            if not scrollable_container:
                scrollable_container = dataframe_element
                print("Using dataframe element as scrollable container")
                
        except Exception as e:
            print(f"Could not find specific scrollable container: {e}")
            scrollable_container = dataframe_element
        
        def extract_current_data():
            try:
                rows = dataframe_element.find_elements(By.CSS_SELECTOR, "[role='row']")
                current_data = []
                current_headers = []
                
                if rows:
                    #get headers from first row
                    first_row = rows[0]
                    header_cells = first_row.find_elements(By.CSS_SELECTOR, "[role='columnheader'], [role='gridcell']")
                    current_headers = [cell.get_attribute('innerText') or cell.text for cell in header_cells if (cell.get_attribute('innerText') or cell.text).strip()]
                    
                    
                    for row in rows[1:]:
                        cells = row.find_elements(By.CSS_SELECTOR, "[role='gridcell']")
                        row_data = [cell.get_attribute('innerText') or cell.text for cell in cells]
                        if any(cell.strip() for cell in row_data):
                            current_data.append(row_data)
                
                return current_headers, current_data
            except Exception as e:
                print(f"Error extracting current data: {e}")
                return [], []
        
        
        headers, initial_data = extract_current_data()
        print(f"Initial extraction - Headers: {headers}, Rows: {len(initial_data)}")
        
        if not headers:
            raise Exception("Could not extract headers")
        
        seen_rows = set()
        all_data = []
        
        
        for row in initial_data:
            row_key = tuple(row)
            if row_key not in seen_rows:
                seen_rows.add(row_key)
                all_data.append(row)
        
        print(f"Starting with {len(all_data)} unique rows")
        
        #scroll through the table to load all data
        last_count = 0
        no_change_count = 0
        max_scrolls = 200 
        scroll_count = 0
        
        while scroll_count < max_scrolls:
            scroll_count += 1
            
            #scroll down in the container
            try:
            
                driver.execute_script("arguments[0].scrollTop += 500;", scrollable_container)
                time.sleep(0.5) 
                
                actions = ActionChains(driver)
                actions.move_to_element(scrollable_container).perform()
                actions.send_keys(Keys.PAGE_DOWN).perform()
                time.sleep(0.3)
                
            except Exception as e:
                print(f"Scrolling method failed: {e}")
                driver.execute_script("window.scrollBy(0, 300);")
                time.sleep(0.5)
            _, current_data = extract_current_data()
            
        
            new_rows_added = 0
            for row in current_data:
                row_key = tuple(row)
                if row_key not in seen_rows:
                    seen_rows.add(row_key)
                    all_data.append(row)
                    new_rows_added += 1
            
            print(f"Scroll {scroll_count}: Found {len(current_data)} current rows, Added {new_rows_added} new rows, Total unique: {len(all_data)}")
            
            
            if len(all_data) == last_count:
                no_change_count += 1
                if no_change_count >= 5:  #stop if no new data for 5 consecutive scrolls
                    print("No new data found for 5 consecutive scrolls, stopping...")
                    break
            else:
                no_change_count = 0
                last_count = len(all_data)
            
            #scrolling to the bottom to ensure we get everything
            if scroll_count % 10 == 0:
                try:
                    driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight;", scrollable_container)
                    time.sleep(1)
                    driver.execute_script("arguments[0].scrollTop = 0;", scrollable_container)
                    time.sleep(1)
                except:
                    pass
        
        if all_data:
            dataframe_found = True
            print(f"Successfully extracted {len(all_data)} rows")
            
    except Exception as e:
        print(f"Method 1 failed: {e}")
        import traceback
        traceback.print_exc()
    
    #method 2:looking for HTML table
    if not dataframe_found:
        try:
            print("Method 2:looking for HTML table...")
            tables = driver.find_elements(By.TAG_NAME, "table")
            
            if tables:
                table = tables[0]
                
                header_elements = table.find_elements(By.CSS_SELECTOR, "th")
                if header_elements:
                    headers = [th.text.strip() for th in header_elements if th.text.strip()]
                
                
                row_elements = table.find_elements(By.CSS_SELECTOR, "tbody tr")
                if not row_elements:
                    row_elements = table.find_elements(By.CSS_SELECTOR, "tr")[1:]  # Skip header
                
                for row in row_elements:
                    cells = row.find_elements(By.CSS_SELECTOR, "td")
                    row_data = [cell.text.strip() for cell in cells]
                    if any(cell for cell in row_data):
                        all_data.append(row_data)
                
                if all_data:
                    dataframe_found = True
                    
        except Exception as e:
            print(f"Fallback Method 2 failed: {e}")
    
    #method 3:js extraction with scrolling
    if not dataframe_found:
        try:
            print("Fallback Method 3: JavaScript extraction with scrolling...")
            
        
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            driver.execute_script("window.scrollTo(0, 0);")
            time.sleep(2)
            
            #execute js to find all table data
            script = """
            var result = {headers: [], data: []};
            
        
            var dataframes = document.querySelectorAll('[data-testid="stDataFrame"]');
            
            if (dataframes.length > 0) {
                var df = dataframes[0];
                
                //get all row elements
                var rows = df.querySelectorAll('[role="row"]');
                
                if (rows.length > 0) {
                    //get headers
                    var headerCells = rows[0].querySelectorAll('[role="columnheader"], [role="gridcell"]');
                    for (var i = 0; i < headerCells.length; i++) {
                        var headerText = headerCells[i].innerText || headerCells[i].textContent || '';
                        if (headerText.trim()) {
                            result.headers.push(headerText.trim());
                        }
                    }
                    
                    //get data rows
                    for (var i = 1; i < rows.length; i++) {
                        var cells = rows[i].querySelectorAll('[role="gridcell"]');
                        var rowData = [];
                        
                        for (var j = 0; j < cells.length; j++) {
                            var cellText = cells[j].innerText || cells[j].textContent || '';
                            rowData.push(cellText.trim());
                        }
                        
                        if (rowData.some(cell => cell !== '')) {
                            result.data.push(rowData);
                        }
                    }
                }
            }
            
            return result;
            """
            
            js_result = driver.execute_script(script)
            
            if js_result and js_result['headers'] and js_result['data']:
                headers = js_result['headers']
                all_data = js_result['data']
                dataframe_found = True
                print(f"JavaScript extraction found {len(all_data)} rows")
                
        except Exception as e:
            print(f"Method 3 failed: {e}")
    
    if not dataframe_found:
        print("Page source preview:")
        print(driver.page_source[:2000])
        return None
    
    print(f"Extracted data:")
    print(f"Headers: {headers}")
    print(f"Data rows: {len(all_data)}")
    
    if all_data and headers:
        max_cols = len(headers)
        cleaned_rows = []
        
        for row in all_data:
            #ensure row has same number of columns as headers
            while len(row) < max_cols:
                row.append('')
            cleaned_rows.append(row[:max_cols])
        
        df = pd.DataFrame(cleaned_rows, columns=headers)
        
        #clean numeric columns
        for col in df.columns:
            if col.lower() in ['age', 'score']:
                try:
                    df[col] = pd.to_numeric(df[col])
                except (ValueError, TypeError):
                    pass  #keep as string if conversion fails
        
    else:
        print("No valid data")
        return None
    
    
    csv_filename = "scraped_streamlit_data.csv"
    df.to_csv(csv_filename, index=False)
    print(f"Data saved to {csv_filename}")
    
    print(f"\nDataFrame shape: {df.shape}")
    print(f"Total rows extracted: {len(df)}")
    
    return df
    
  except Exception as e:
    print(f"An error occurred: {str(e)}")
    import traceback
    traceback.print_exc()
    return None
    
  finally:
    driver.quit()
    print("Browser closed.")

if __name__ == "__main__":
    print("Starting")

    scraped_data = scrape_streamlit_table()

    if scraped_data is not None:
        print("Scraping completed")
        print(f"Final result: {len(scraped_data)} rows extracted")
    else:
        print("Scraping failed")