|
import json |
|
import streamlit as st |
|
from google.oauth2 import service_account |
|
from google.cloud import language_v1 |
|
import requests |
|
from bs4 import BeautifulSoup |
|
import pandas as pd |
|
|
|
def sample_classify_text(text_content: str) -> list:
    """Classify text with the Google Cloud Natural Language API.

    Uses the V2 content-categories model. Service-account credentials are
    read from the Streamlit secret ``google_nlp``, which may hold either a
    JSON string of the key file or a mapping (e.g. a TOML table) with the
    same fields.

    Args:
        text_content: Plain text to classify; it is submitted as English.

    Returns:
        A list of ``(category_name, confidence)`` tuples. The list is empty
        when ``text_content`` is blank (no API call is made) or when the
        credentials secret is missing or malformed (an error is shown in
        the Streamlit UI in that case).
    """
    # Nothing to classify: skip the credential lookup and the API round trip.
    if not text_content or not text_content.strip():
        return []

    try:
        raw_secret = st.secrets["google_nlp"]
    except (KeyError, FileNotFoundError):
        # KeyError: the key is absent. FileNotFoundError: presumably raised
        # when no secrets file exists at all -- confirm against the
        # installed Streamlit version.
        st.error("Missing 'google_nlp' secret.")
        return []

    if isinstance(raw_secret, (str, bytes, bytearray)):
        try:
            service_account_info = json.loads(raw_secret)
        except json.JSONDecodeError:
            st.error("Invalid or empty JSON in 'google_nlp' secret.")
            return []
    else:
        # Secret was stored as a table rather than a JSON string.
        service_account_info = dict(raw_secret)

    # Valid JSON that is not an object (e.g. "123") cannot describe a key.
    if not isinstance(service_account_info, dict):
        st.error("Invalid or empty JSON in 'google_nlp' secret.")
        return []

    credentials = service_account.Credentials.from_service_account_info(
        service_account_info,
        scopes=["https://www.googleapis.com/auth/cloud-platform"],
    )
    client = language_v1.LanguageServiceClient(credentials=credentials)

    document = {
        "content": text_content,
        "type_": language_v1.Document.Type.PLAIN_TEXT,
        "language": "en",
    }
    content_categories_version = (
        language_v1.ClassificationModelOptions.V2Model.ContentCategoriesVersion.V2
    )
    response = client.classify_text(
        request={
            "document": document,
            "classification_model_options": {
                "v2_model": {
                    "content_categories_version": content_categories_version
                }
            },
        }
    )

    return [
        (category.name, category.confidence)
        for category in response.categories
    ]
|
|
|
def fetch_urls_from_sitemap(sitemap_url: str, timeout: float = 10) -> list:
    """Download an XML sitemap and return the URLs in its ``<loc>`` tags.

    Args:
        sitemap_url: Address of the sitemap to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
            without one, a stalled server would block the app indefinitely.

    Returns:
        The text of every ``<loc>`` element in document order, with
        surrounding whitespace removed and empty entries dropped.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(sitemap_url, timeout=timeout)
    soup = BeautifulSoup(response.content, "xml")
    # Sitemaps are often pretty-printed, leaving newlines/indentation inside
    # <loc>; strip them so each value is a usable URL.
    locations = (element.text.strip() for element in soup.find_all("loc"))
    return [location for location in locations if location]
|
|
|
def fetch_text_from_url(url: str, timeout: float = 10) -> str:
    """Download a page and return its visible text as one normalized line.

    Args:
        url: Address of the HTML page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
            without one, a stalled server would block the app indefinitely.

    Returns:
        The page text with ``<script>``/``<style>`` contents removed and
        every run of whitespace collapsed to a single space.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    # JavaScript and CSS source is not page content.
    for tag in soup.find_all(["script", "style"]):
        tag.decompose()

    # Join text nodes with a space so words from adjacent elements
    # (e.g. "<p>a</p><p>b</p>") are not fused together.
    text = soup.get_text(separator=" ")
    return " ".join(text.split())
|
|
|
def main():
    """Render the Streamlit page and run the sitemap analysis on demand.

    When the "Analyze Sitemap" button is pressed, the URLs listed in the
    sitemap are fetched, each page's text is classified, and the resulting
    (URL, Category, Confidence) rows are written to
    ``url_classification_results.csv`` and shown as a table. A URL that
    cannot be fetched or classified is reported and skipped so that one bad
    page does not abort the whole run.
    """
    st.title("URL Sitemap Analyzer")

    sitemap_url = st.text_input("Enter the sitemap URL")
    if not st.button("Analyze Sitemap"):
        return

    sitemap_url = sitemap_url.strip()
    if not sitemap_url:
        st.warning("Please enter a sitemap URL.")
        return

    st.write("Fetching URLs from sitemap...")
    try:
        urls = fetch_urls_from_sitemap(sitemap_url)
    except requests.RequestException as exc:
        st.error(f"Could not fetch sitemap: {exc}")
        return
    st.write(f"Found {len(urls)} URLs")

    results = []
    for url in urls:
        st.write(f"Analyzing URL: {url}")
        try:
            text_content = fetch_text_from_url(url)
            categories = sample_classify_text(text_content)
        except Exception as exc:
            # Deliberately broad: this is the per-URL boundary of a
            # best-effort batch, and both network and API errors can occur
            # here. The failure is surfaced to the user, not swallowed.
            st.warning(f"Skipping {url}: {exc}")
            continue
        results.extend(
            {"URL": url, "Category": category, "Confidence": confidence}
            for category, confidence in categories
        )

    if not results:
        st.warning("No categories were found for the URLs in this sitemap.")
        return

    df = pd.DataFrame(results)
    df.to_csv('url_classification_results.csv', index=False)
    st.write("Analysis complete. Results saved to 'url_classification_results.csv'")
    st.dataframe(df)
|
|
|
# Script entry point: build the page only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|