File size: 2,695 Bytes
5d6c700
 
 
 
50ade8b
 
 
5d6c700
 
f2bf6ab
 
 
 
50ade8b
f2bf6ab
5d6c700
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50ade8b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5d6c700
50ade8b
 
 
 
 
 
 
 
 
 
 
 
 
 
5d6c700
50ade8b
 
 
 
 
5d6c700
50ade8b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import json
import streamlit as st
from google.oauth2 import service_account
from google.cloud import language_v1
import requests
from bs4 import BeautifulSoup
import pandas as pd

def sample_classify_text(text_content):
    """Classify text with the Google Cloud Natural Language V2 category model.

    Credentials are read from the Streamlit secret ``google_nlp``, which may
    be either a JSON string or a TOML table holding the service-account
    fields.

    Args:
        text_content: Plain English text to classify.

    Returns:
        A list of ``(category_name, confidence)`` tuples. An empty list is
        returned when the text is blank or the secret is missing or is not
        valid JSON; in the latter two cases an error is shown in the
        Streamlit UI.

    Raises:
        Whatever ``client.classify_text`` raises for a failed API call; those
        errors are not handled here.
    """
    # Nothing to classify: skip credential parsing and the remote call.
    if not text_content or not text_content.strip():
        return []

    try:
        raw_secret = st.secrets["google_nlp"]
    except (KeyError, FileNotFoundError):
        # KeyError: key absent; FileNotFoundError: no secrets file at all.
        st.error("Missing 'google_nlp' secret.")
        return []

    if isinstance(raw_secret, str):
        try:
            service_account_info = json.loads(raw_secret)
        except json.JSONDecodeError:
            st.error("Invalid or empty JSON in 'google_nlp' secret.")
            return []
    else:
        # Secret declared as a TOML table: it is already a mapping, so
        # json.loads would raise TypeError on it. Copy into a plain dict.
        service_account_info = dict(raw_secret)

    credentials = service_account.Credentials.from_service_account_info(
        service_account_info,
        scopes=["https://www.googleapis.com/auth/cloud-platform"],
    )
    client = language_v1.LanguageServiceClient(credentials=credentials)

    document = {
        "content": text_content,
        "type_": language_v1.Document.Type.PLAIN_TEXT,
        "language": "en",
    }
    content_categories_version = (
        language_v1.ClassificationModelOptions.V2Model.ContentCategoriesVersion.V2
    )
    response = client.classify_text(
        request={
            "document": document,
            "classification_model_options": {
                "v2_model": {"content_categories_version": content_categories_version}
            },
        }
    )

    return [(category.name, category.confidence) for category in response.categories]

def fetch_urls_from_sitemap(sitemap_url):
    """Download a sitemap and return the URLs listed in its ``<loc>`` elements.

    Args:
        sitemap_url: Address of the XML sitemap to download.

    Returns:
        A list of URL strings with surrounding whitespace removed; empty
        ``<loc>`` entries are dropped.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Without a timeout an unresponsive server would block the app forever.
    response = requests.get(sitemap_url, timeout=30)
    soup = BeautifulSoup(response.content, 'xml')
    # Pretty-printed sitemaps often wrap the URL in newlines/indentation.
    stripped = (element.text.strip() for element in soup.find_all('loc'))
    return [loc for loc in stripped if loc]

def fetch_text_from_url(url):
    """Download a page and return its text with whitespace collapsed.

    The response body is parsed with ``html.parser``; every text node returned
    by ``get_text()`` is kept, and all runs of whitespace are replaced by a
    single space.

    Args:
        url: Address of the page to download.

    Returns:
        The page text as a single whitespace-normalised string.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Without a timeout one slow page would stall the whole sitemap run.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    text = soup.get_text()
    # split() with no argument splits on any whitespace run and drops empties.
    return ' '.join(text.split())

def main():
    """Render the Streamlit UI and run the sitemap classification workflow.

    When the button is pressed, the sitemap is fetched, each listed URL is
    downloaded and classified, and one row per (URL, category) pair is
    collected. Non-empty results are written to
    ``url_classification_results.csv`` and shown as a table.

    A URL that fails to download or classify is reported and skipped so that
    the results gathered from the other URLs are not lost.
    """
    st.title("URL Sitemap Analyzer")

    sitemap_url = st.text_input("Enter the sitemap URL")
    if not st.button("Analyze Sitemap"):
        return
    if not sitemap_url:
        st.warning("Please enter a sitemap URL.")
        return

    st.write("Fetching URLs from sitemap...")
    try:
        urls = fetch_urls_from_sitemap(sitemap_url)
    except requests.RequestException as exc:
        st.error(f"Could not fetch sitemap: {exc}")
        return
    st.write(f"Found {len(urls)} URLs")

    results = []
    for url in urls:
        st.write(f"Analyzing URL: {url}")
        try:
            text_content = fetch_text_from_url(url)
            categories = sample_classify_text(text_content)
        except Exception as exc:
            # Top-level boundary: a single bad page (network error, API
            # rejection) must not abort the run; report it and move on.
            st.warning(f"Skipping {url}: {exc}")
            continue
        results.extend(
            {"URL": url, "Category": category, "Confidence": confidence}
            for category, confidence in categories
        )

    if not results:
        st.warning("No categories were returned for any URL.")
        return

    df = pd.DataFrame(results)
    df.to_csv('url_classification_results.csv', index=False)
    st.write("Analysis complete. Results saved to 'url_classification_results.csv'")
    st.dataframe(df)

# Entry point: call main() only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()