Hey, I figured that there might be a conflict in how trafilatura and streamlit handle thread management. Although this might be cheating, I finally fixed it by switching the library I use to extract the text from a given URL, from trafilatura to newspaper3k. This is my executable code:
import streamlit as st
import validators
import requests
import trafilatura
from newspaper import Article
# Set the page title for the app.
st.set_page_config(page_title="Demo")

# Sidebar input: read the URL from a text box and drop surrounding whitespace.
URL = st.sidebar.text_input('put in url', "").strip()

# Sidebar search button; its return value is stored for the dispatch below.
button_clicked = st.sidebar.button("🔍")

# Reduce the validators.url() result to a plain True/False flag.
isValid = True if validators.url(URL) else False
def fetch_url(url):
    """Download and parse the page at *url* with newspaper.

    Returns the ``Article`` object after ``download()`` and ``parse()`` have
    been called on it, so callers can read attributes such as ``title`` and
    ``text``.
    """
    page = Article(url)
    page.download()
    page.parse()
    return page
def extract_text_from_url(URL):
    """Fetch *URL* and return its extracted ``(article_text, title)``.

    The URL is first probed with ``requests``; only when that request
    succeeds is the page handed to ``fetch_url`` for extraction (which
    performs its own download).

    On failure (request error, timeout, or a non-OK HTTP status) an error
    is shown in the sidebar and ``("", "")`` is returned, so callers can
    always unpack the result into two values.
    """
    try:
        # A timeout keeps the app from hanging forever on an unresponsive host.
        resp = requests.get(URL, timeout=10)
    except requests.RequestException:
        # Unreachable host, DNS failure, timeout, ... -- report it the same
        # way as a non-OK response instead of crashing the script.
        st.sidebar.error('This is not an URL. Put in valid URL.')
        return "", ""

    if not resp.ok:
        st.sidebar.error('This is not an URL. Put in valid URL.')
        # Return an empty pair rather than falling through to an implicit
        # None, which would break ``article, title = ...`` at the call site.
        return "", ""

    fetched_article = fetch_url(URL)
    return fetched_article.text, fetched_article.title
# MAIN-PAGE display
# Nothing happens until the sidebar button has been clicked. After a click,
# an invalid URL is reported in the sidebar; a valid one is passed on to
# text extraction.
if button_clicked and not isValid:
    st.sidebar.error('This is not an URL. Put in valid URL.')
elif button_clicked:
    # TEXT EXTRACTION
    article, title = extract_text_from_url(URL)