Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import requests
|
3 |
+
from bs4 import BeautifulSoup
|
4 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
5 |
+
from nltk.corpus import stopwords
|
6 |
+
from nltk.tokenize import word_tokenize
|
7 |
+
from nltk.util import ngrams
|
8 |
+
from nltk.stem import PorterStemmer
|
9 |
+
import pandas as pd
|
10 |
+
import nltk
|
11 |
+
import string
|
12 |
+
import io
|
13 |
+
import os
|
14 |
+
|
15 |
+
# Key sent as "apikey" in the keywordtool.io request built by
# get_keyword_metrics(); read from the API_KEY environment variable
# (os.getenv returns None when the variable is not set).
api_key = os.getenv("API_KEY")
|
16 |
+
|
17 |
+
# ==========1- NLTK DOWNLOADS=========
|
18 |
+
def ensure_nltk_data():
    """Make sure the NLTK resources used by this app are installed.

    Each resource is looked up by its data path; when the lookup fails with
    ``LookupError`` the matching package is downloaded.
    """
    # Maps the lookup path inside the NLTK data directory to the package name
    # that nltk.download() expects. Insertion order is the check order.
    required = {
        "tokenizers/punkt": "punkt",
        "corpora/stopwords": "stopwords",
        "tokenizers/punkt_tab": "punkt_tab",
    }
    for lookup_path, package in required.items():
        try:
            nltk.data.find(lookup_path)
        except LookupError:
            nltk.download(package)


ensure_nltk_data()
|
31 |
+
|
32 |
+
# =======2-EXTRACT FUNCTION WITH USER AGENT==========
|
33 |
+
def extract_blog_content(url, timeout=10):
    """Download a page and extract the text fields used for keyword analysis.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled host cannot block the app indefinitely.

    Returns:
        A 4-tuple of strings
        ``(meta_title, meta_description, article_title, blog_text)``:
        the ``<meta name="title">`` content (falling back to the text of the
        ``<title>`` element), the ``<meta name="description">`` content, the
        text of the first ``<h1>``, and the text of every ``<p>`` joined with
        single spaces. A field that is absent from the page is ``''``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; rv:105.0) "
            "Gecko/20100101 Firefox/105.0"
        )
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    def meta_content(name):
        """Return the content attribute of <meta name=...>, or '' if absent."""
        tag = soup.find('meta', attrs={'name': name})
        if tag is None:
            return ''
        return tag.get('content', '')

    meta_title = meta_content('title')
    if not meta_title:
        # A <title> element carries its value as text, not in a "content"
        # attribute, so it has to be read with get_text().
        title_tag = soup.find('title')
        meta_title = title_tag.get_text(strip=True) if title_tag is not None else ''

    meta_description = meta_content('description')

    article_title_element = soup.find('h1')
    article_title = (
        article_title_element.get_text(strip=True)
        if article_title_element is not None
        else ''
    )

    blog_text = " ".join(p.get_text() for p in soup.find_all('p'))
    return meta_title, meta_description, article_title, blog_text
|
54 |
+
|
55 |
+
#========3- PREPROCESSING + TF-IDF LOGIC=======
|
56 |
+
def preprocess_text(text):
    """Tokenize *text* and drop English stop words and punctuation tokens.

    Args:
        text: Raw text to analyse; it is lower-cased before tokenizing.

    Returns:
        A list of lower-case tokens in their original order.
    """
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    # A token is discarded when it consists solely of punctuation characters.
    # Testing `token in string.punctuation` would be a substring test and
    # would let multi-character tokens such as "''", "``" or "..." through.
    return [
        token
        for token in word_tokenize(text.lower())
        if token not in stop_words
        and not all(char in punctuation for char in token)
    ]
|
63 |
+
|
64 |
+
def generate_ngrams(tokens, max_n=3):
    """Return all n-grams of *tokens* for n = 1..max_n as space-joined strings.

    The result lists every 1-gram first, then every 2-gram, and so on up to
    ``max_n``; within one size the n-grams keep their order of appearance.
    """
    return [
        " ".join(gram)
        for size in range(1, max_n + 1)
        for gram in ngrams(tokens, size)
    ]
|
70 |
+
# ======= 4-KEYWORD TOOL API + SELECTION LOGIC ==========
|
71 |
+
def get_keyword_metrics(keywords):
    """Request search metrics for *keywords* from the keywordtool.io API.

    Args:
        keywords: List of keyword strings to look up.

    Returns:
        The decoded JSON response as a dict on success. On any failure
        (no keywords, network error, non-200 status, undecodable body) an
        error is shown with ``st.error`` and ``{}`` is returned, so callers
        can simply test the result for truthiness.
    """
    if not keywords:
        st.error("No keywords to process.")
        return {}

    url = "https://api.keywordtool.io/v2/search/volume/google"
    payload = {
        "metrics_network": "googlesearchnetwork",
        "metrics_currency": "USD",
        "complete": False,
        "output": "json",
        "apikey": api_key,
        "keyword": keywords,
    }
    headers = {"content-type": "application/json"}

    try:
        # The timeout keeps a stalled API from freezing the app.
        response = requests.post(url, json=payload, headers=headers, timeout=30)
    except requests.RequestException as exc:
        st.error(f"API Error: {exc}")
        return {}

    if response.status_code != 200:
        st.error("API Error: " + response.text)
        return {}

    try:
        return response.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not JSON.
        st.error("API Error: response was not valid JSON.")
        return {}
|
91 |
+
def select_top_keywords(metrics_response, percentage, scored_keywords):
    """Merge API metrics with TF-IDF scores and keep the best-scoring share.

    Args:
        metrics_response: Decoded API response; its ``'results'`` entry maps
            each keyword to a dict read for the keys ``'volume'``,
            ``'trend'``, ``'cpc'`` and ``'cmp'``. Missing or null values
            count as 0.
        percentage: Share of the keywords to keep, from 0 to 100.
        scored_keywords: Iterable of ``(keyword, tfidf_score)`` pairs. A
            keyword that does not appear in it gets a score of 0.

    Returns:
        A list of ``(keyword, score_percent, search_volume, trend, cpc,
        competition)`` tuples sorted by ``score_percent`` in descending
        order. At least one entry is kept whenever any keyword is available.
    """
    # `or {}` also covers a response whose 'results' value is null.
    keywords_data = metrics_response.get('results') or {}

    # One dict lookup per keyword instead of a linear scan; setdefault keeps
    # the first score seen if a keyword is listed more than once.
    tfidf_by_keyword = {}
    for scored_keyword, score in scored_keywords:
        tfidf_by_keyword.setdefault(scored_keyword, score)

    keyword_scores = []
    for keyword, data in keywords_data.items():
        data = data or {}  # defensive: treat a null metrics entry as empty
        percentage_score = tfidf_by_keyword.get(keyword, 0) * 100
        keyword_scores.append((
            keyword,
            percentage_score,
            data.get('volume') or 0,
            data.get('trend') or 0,
            data.get('cpc') or 0,
            data.get('cmp') or 0,
        ))

    sorted_keywords = sorted(keyword_scores, key=lambda item: item[1], reverse=True)

    # Multiply before dividing: len * (percentage / 100) suffers from float
    # rounding (100 * 0.29 == 28.999999999999996), which truncated to one
    # keyword fewer than requested.
    top_count = max(1, int(len(sorted_keywords) * percentage // 100))
    return sorted_keywords[:top_count]
|
110 |
+
|
111 |
+
# =======5-UI & LOGIC FLOW==========
|
112 |
+
|
113 |
+
st.title("Keyword Analysis Tool")

# A. URL input
url = st.text_input("Enter the URL:", key="url_input")

# Seed the session with an empty string for every editable field, so the
# widgets further down always have a value to show, even before any fetch.
for _field in ("meta_title", "meta_description", "article_title", "article_text"):
    if _field not in st.session_state:
        st.session_state[_field] = ""
|
126 |
+
|
127 |
+
# B- Step 1: Fetch Data
|
128 |
+
if st.button("Fetch Data"):
    target_url = url.strip()
    if target_url:
        try:
            meta_title, meta_description, article_title, blog_text = extract_blog_content(target_url)
        except requests.RequestException as exc:
            # Covers malformed URLs, DNS/connection failures and timeouts;
            # show a message instead of letting the page crash.
            st.error(f"Could not fetch the URL: {exc}")
        else:
            # Store the fetched values so they survive Streamlit reruns and
            # can be edited by the user before analysis.
            st.session_state.meta_title = meta_title
            st.session_state.meta_description = meta_description
            st.session_state.article_title = article_title
            st.session_state.article_text = blog_text
    else:
        st.error("Please enter a valid URL.")
|
137 |
+
|
138 |
+
# C-Show the fetched data so user can modify
|
139 |
+
st.subheader("Modify Fetched Content")
# Each field is shown in an editable widget pre-filled from the session, and
# whatever the widget returns is written straight back to the session.
for _state_key, _label, _make_widget in (
    ("meta_title", "Meta Title", st.text_input),
    ("meta_description", "Meta Description", st.text_area),
    ("article_title", "Article Title", st.text_input),
    ("article_text", "Article Text", st.text_area),
):
    st.session_state[_state_key] = _make_widget(_label, st.session_state[_state_key])

# D- Checkboxes to select which parts to analyze
(
    include_meta_title,
    include_meta_description,
    include_article_title,
    include_article_text,
) = [
    st.checkbox(_checkbox_label)
    for _checkbox_label in (
        "Include Meta Title",
        "Include Meta Description",
        "Include Article Title",
        "Include Article Text",
    )
]

# E- Top % of Keywords
top_percentage = st.number_input(
    "Top % of Keywords to Display",
    min_value=1,
    max_value=100,
    value=100,
    step=1,
)
|
153 |
+
|
154 |
+
# F- Analyze Button -> runs the original logic
|
155 |
+
# The results table is kept in st.session_state because st.button() is True
# only during the single rerun triggered by the click. Anything rendered
# solely inside the `if` would vanish as soon as the user touched the format
# selector or a download button.
if "keyword_results" not in st.session_state:
    st.session_state.keyword_results = None

if st.button("Analyze"):
    st.session_state.keyword_results = None  # discard results of an earlier run

    if not url.strip():
        st.error("Please enter a valid URL.")
    else:
        # Join only the parts the user ticked, in a fixed order.
        selected_text = " ".join(
            part
            for include, part in (
                (include_meta_title, st.session_state.meta_title),
                (include_meta_description, st.session_state.meta_description),
                (include_article_title, st.session_state.article_title),
                (include_article_text, st.session_state.article_text),
            )
            if include
        )

        if not selected_text.strip():
            st.error("No text selected for analysis. Please check at least one option.")
        else:
            max_ngram = 3
            tokens = preprocess_text(selected_text)
            # dict.fromkeys removes duplicates while keeping first-seen order,
            # so ties between equally scored keywords are resolved the same
            # way on every run (a set would make the order arbitrary).
            unique_ngrams = list(dict.fromkeys(generate_ngrams(tokens, max_n=max_ngram)))

            if not unique_ngrams:
                st.error("Vocabulary is empty. Please ensure valid input data.")
            else:
                # The vectorizer must produce the same terms the vocabulary
                # holds: n-grams up to max_ngram built from whitespace-
                # separated tokens. With the defaults (unigrams only, tokens
                # of 2+ word characters) every multi-word entry scores 0.
                tfidf_vectorizer = TfidfVectorizer(
                    vocabulary=unique_ngrams,
                    ngram_range=(1, max_ngram),
                    token_pattern=r"(?u)\S+",
                )
                tfidf_scores = tfidf_vectorizer.fit_transform([" ".join(tokens)]).toarray()[0]

                # Only the 100 best-scoring n-grams are sent to the API.
                scored_keywords = sorted(
                    zip(unique_ngrams, tfidf_scores),
                    key=lambda pair: pair[1],
                    reverse=True,
                )[:100]
                keywords = [keyword for keyword, _ in scored_keywords]

                metrics_response = get_keyword_metrics(keywords)
                if metrics_response:
                    # Select top keywords based on user percentage
                    top_keywords_data = select_top_keywords(
                        metrics_response, top_percentage, scored_keywords
                    )
                    st.session_state.keyword_results = pd.DataFrame({
                        "Keyword": [row[0] for row in top_keywords_data],
                        "Score (%)": [f"{row[1]:.2f}" for row in top_keywords_data],
                        "Search Volume": [row[2] for row in top_keywords_data],
                        "Trend": [row[3] for row in top_keywords_data],
                        "CPC": [row[4] for row in top_keywords_data],
                        "Competition": [row[5] for row in top_keywords_data],
                    })

# Rendered on every rerun while results exist, so the table, the format
# selector and the download buttons stay available after the click.
results_df = st.session_state.keyword_results
if results_df is not None:
    st.dataframe(results_df)

    output_format = st.selectbox("Download format", ["CSV", "Excel"])

    if output_format == "CSV":
        csv_data = results_df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download CSV",
            data=csv_data,
            file_name="keywords.csv",
            mime="text/csv",
            key="download-csv",
        )
    else:  # Excel
        excel_buffer = io.BytesIO()
        with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
            results_df.to_excel(writer, index=False, sheet_name="Sheet1")
        excel_data = excel_buffer.getvalue()

        st.download_button(
            label="Download Excel",
            data=excel_data,
            file_name="keywords.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            key="download-excel",
        )
|