blazingbunny's picture
Update app.py
50ade8b verified
raw
history blame
2.7 kB
import json
import streamlit as st
from google.oauth2 import service_account
from google.cloud import language_v1
import requests
from bs4 import BeautifulSoup
import pandas as pd
def sample_classify_text(text_content):
    """Classify text into content categories via the Cloud Natural Language API.

    Credentials are read from the Streamlit secret ``google_nlp``, which is
    parsed as a JSON service-account key. The text is submitted as an English
    (``"en"``) plain-text document using the V2 content-categories model.

    Args:
        text_content: The plain text to classify.

    Returns:
        A list of ``(category_name, confidence)`` tuples, one per category in
        the API response. An empty list is returned without calling the API
        when ``text_content`` is empty or whitespace-only, or when the secret
        is missing or cannot be parsed (in which case an error is shown with
        ``st.error``).
    """
    # Nothing to classify: skip the credential load and the API round trip.
    if not text_content or not text_content.strip():
        return []

    try:
        service_account_info = json.loads(st.secrets["google_nlp"])
    except (KeyError, FileNotFoundError):
        # KeyError: the key is absent. FileNotFoundError is meant to cover a
        # missing secrets file -- NOTE(review): confirm against the installed
        # Streamlit version.
        st.error("Missing 'google_nlp' secret.")
        return []
    except (json.JSONDecodeError, TypeError):
        # TypeError: the secret exists but is not a str/bytes JSON payload.
        st.error("Invalid or empty JSON in 'google_nlp' secret.")
        return []

    credentials = service_account.Credentials.from_service_account_info(
        service_account_info,
        scopes=["https://www.googleapis.com/auth/cloud-platform"],
    )
    client = language_v1.LanguageServiceClient(credentials=credentials)

    document = {
        "content": text_content,
        "type_": language_v1.Document.Type.PLAIN_TEXT,
        "language": "en",
    }
    content_categories_version = (
        language_v1.ClassificationModelOptions.V2Model.ContentCategoriesVersion.V2
    )
    response = client.classify_text(
        request={
            "document": document,
            "classification_model_options": {
                "v2_model": {"content_categories_version": content_categories_version}
            },
        }
    )
    return [(category.name, category.confidence) for category in response.categories]
def fetch_urls_from_sitemap(sitemap_url, timeout=30):
    """Download a sitemap and return the URLs listed in its ``<loc>`` elements.

    Args:
        sitemap_url: Address of the sitemap XML document.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled server.

    Returns:
        A list of URL strings, in document order, with surrounding whitespace
        removed and blank entries dropped.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status (4xx/5xx).
    """
    response = requests.get(sitemap_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a sitemap.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'xml')
    # Sitemaps are often pretty-printed, leaving whitespace around each URL.
    stripped = (element.text.strip() for element in soup.find_all('loc'))
    return [url for url in stripped if url]
def fetch_text_from_url(url, timeout=30):
    """Download a page and return its visible text with whitespace collapsed.

    Args:
        url: Address of the HTML page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled server.

    Returns:
        The page text as a single string in which every run of whitespace is
        replaced by one space.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status (4xx/5xx).
    """
    response = requests.get(url, timeout=timeout)
    # Do not hand the text of a 404/500 error page to the classifier.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    # Separate the text of adjacent elements; with the default empty separator
    # "<h1>Title</h1><p>Body</p>" would come out as "TitleBody".
    text = soup.get_text(separator=' ')
    return ' '.join(text.split())
def main():
    """Render the Streamlit page and run the sitemap analysis on demand.

    When the "Analyze Sitemap" button is pressed, the URLs listed in the
    entered sitemap are fetched, each page's text is classified, and the
    resulting (URL, Category, Confidence) rows are written to
    ``url_classification_results.csv`` and shown as a table.

    A URL that cannot be fetched or classified is reported with a warning and
    skipped, so one bad page does not abort the whole run.
    """
    st.title("URL Sitemap Analyzer")
    sitemap_url = st.text_input("Enter the sitemap URL")

    if not st.button("Analyze Sitemap"):
        return

    sitemap_url = sitemap_url.strip()
    if not sitemap_url:
        st.warning("Please enter a sitemap URL.")
        return

    st.write("Fetching URLs from sitemap...")
    try:
        urls = fetch_urls_from_sitemap(sitemap_url)
    except requests.RequestException as exc:
        st.error(f"Could not fetch the sitemap: {exc}")
        return
    st.write(f"Found {len(urls)} URLs")

    results = []
    for url in urls:
        st.write(f"Analyzing URL: {url}")
        try:
            text_content = fetch_text_from_url(url)
        except requests.RequestException as exc:
            st.warning(f"Skipping {url}: could not fetch page ({exc})")
            continue

        try:
            categories = sample_classify_text(text_content)
        except Exception as exc:
            # Deliberately broad: this is the per-URL boundary, and the
            # classifier's error types are not imported in this file. The
            # failure is reported to the user rather than swallowed.
            st.warning(f"Skipping {url}: classification failed ({exc})")
            continue

        results.extend(
            {"URL": url, "Category": category, "Confidence": confidence}
            for category, confidence in categories
        )

    if not results:
        st.info("No categories were returned for any URL.")
        return

    df = pd.DataFrame(results)
    df.to_csv('url_classification_results.csv', index=False)
    st.write("Analysis complete. Results saved to 'url_classification_results.csv'")
    st.dataframe(df)
# Start the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()