abdulllah01 committed on
Commit
8e29239
·
verified ·
1 Parent(s): 0449dac

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +232 -0
app.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from nltk.corpus import stopwords
6
+ from nltk.tokenize import word_tokenize
7
+ from nltk.util import ngrams
8
+ from nltk.stem import PorterStemmer
9
+ import pandas as pd
10
+ import nltk
11
+ import string
12
+ import io
13
+ import os
14
+
15
+ api_key = os.getenv("API_KEY")
16
+
17
# ==========1- NLTK DOWNLOADS=========
def ensure_nltk_data():
    """Download any NLTK resources the app needs that are not yet present."""
    required = {
        "punkt": "tokenizers/punkt",
        "stopwords": "corpora/stopwords",
        "punkt_tab": "tokenizers/punkt_tab",
    }
    # nltk.data.find raises LookupError when a resource is missing.
    for download_name, resource_path in required.items():
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(download_name)

ensure_nltk_data()
31
+
32
# =======2-EXTRACT FUNCTION WITH USER AGENT==========
def extract_blog_content(url):
    """Fetch *url* and return (meta_title, meta_description, article_title, blog_text).

    A desktop-browser User-Agent is sent because some sites block the default
    python-requests agent. Any field that cannot be found comes back as ''.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; rv:105.0) "
            "Gecko/20100101 Firefox/105.0"
        )
    }
    # timeout so a dead/slow host cannot hang the Streamlit session forever
    response = requests.get(url, headers=headers, timeout=15)
    soup = BeautifulSoup(response.text, 'html.parser')

    meta_title_tag = soup.find('meta', attrs={'name': 'title'})
    meta_description_tag = soup.find('meta', attrs={'name': 'description'})

    if meta_title_tag and 'content' in meta_title_tag.attrs:
        meta_title = meta_title_tag['content']
    else:
        # BUG FIX: the old fallback `or soup.find('title')` then looked up a
        # 'content' attribute, which a <title> tag never has, so the fallback
        # always produced ''. Use the tag's text instead.
        title_tag = soup.find('title')
        meta_title = title_tag.get_text(strip=True) if title_tag else ''

    meta_description = (
        meta_description_tag['content']
        if meta_description_tag and 'content' in meta_description_tag.attrs
        else ''
    )

    # First <h1> is treated as the article headline.
    article_title_element = soup.find('h1')
    article_title = article_title_element.get_text(strip=True) if article_title_element else ''

    # Body text = all paragraph text joined with spaces.
    blog_text = " ".join([p.get_text() for p in soup.find_all('p')])
    return meta_title, meta_description, article_title, blog_text
54
+
55
#========3- PREPROCESSING + TF-IDF LOGIC=======
def preprocess_text(text):
    """Lower-case and tokenize *text*, dropping English stopwords and punctuation.

    Returns a list of tokens. (The original instantiated a PorterStemmer
    that was never used; that dead local has been removed — no stemming
    was ever applied.)
    """
    stop_words = set(stopwords.words('english'))

    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
    return tokens
63
+
64
def generate_ngrams(tokens, max_n=3):
    """Return every 1- through max_n-gram of *tokens* as a space-joined string."""
    collected = []
    for size in range(1, max_n + 1):
        # nltk.util.ngrams yields tuples; join each into a phrase string.
        collected.extend(" ".join(gram) for gram in ngrams(tokens, size))
    return collected
70
# ======= 4-KEYWORD TOOL API + SELECTION LOGIC ==========
def get_keyword_metrics(keywords):
    """Look up volume/trend/CPC/competition for *keywords* via keywordtool.io.

    Returns the parsed JSON response on success, or {} on any failure
    (the error is surfaced in the Streamlit UI via st.error).
    """
    if not keywords:
        st.error("No keywords to process.")
        return {}
    url = "https://api.keywordtool.io/v2/search/volume/google"
    payload = {
        "metrics_network": "googlesearchnetwork",
        "metrics_currency": "USD",
        "complete": False,
        "output": "json",
        "apikey": api_key,  # read from the API_KEY environment variable
        "keyword": keywords
    }
    headers = {"content-type": "application/json"}
    # timeout + exception handling: previously a network failure raised an
    # uncaught exception and a slow API could hang the session indefinitely.
    try:
        response = requests.post(url, json=payload, headers=headers, timeout=30)
    except requests.RequestException as exc:
        st.error("API request failed: " + str(exc))
        return {}
    if response.status_code == 200:
        return response.json()
    else:
        st.error("API Error: " + response.text)
        return {}
91
def select_top_keywords(metrics_response, percentage, scored_keywords):
    """Merge API metrics with TF-IDF scores and keep the top *percentage*.

    Parameters:
        metrics_response: JSON dict from keywordtool.io; per-keyword data
            lives under the 'results' key.
        percentage: number in (0, 100] — share of keywords to keep.
        scored_keywords: list of (keyword, tfidf_score) pairs.

    Returns a list of (keyword, score_pct, search_volume, trend, cpc,
    competition) tuples sorted by score descending. At least one entry is
    returned whenever the response contains any keywords.
    """
    keywords_data = metrics_response.get('results', {})
    # O(1) score lookups; the old `next(...)` scan over scored_keywords for
    # every keyword was accidentally O(n^2). Keywords are unique n-grams, so
    # building a dict preserves the same first-match semantics.
    tfidf_by_keyword = dict(scored_keywords)

    keyword_scores = []
    for keyword, data in keywords_data.items():
        # `or 0` guards against explicit nulls in the API payload.
        search_volume = data.get('volume', 0) or 0
        trend = data.get('trend', 0) or 0
        cpc = data.get('cpc', 0) or 0
        competition = data.get('cmp', 0) or 0

        percentage_score = tfidf_by_keyword.get(keyword, 0) * 100  # convert to percentage
        keyword_scores.append((keyword, percentage_score, search_volume, trend, cpc, competition))

    sorted_keywords = sorted(keyword_scores, key=lambda x: x[1], reverse=True)
    top_count = max(1, int(len(sorted_keywords) * (percentage / 100)))
    return sorted_keywords[:top_count]
110
+
111
# =======5-UI & LOGIC FLOW==========
# Flat Streamlit script: it re-runs top-to-bottom on every user interaction,
# so mutable state must live in st.session_state to survive reruns.

st.title("Keyword Analysis Tool")

# A. URL input
url = st.text_input("Enter the URL:", key="url_input")

# Seed session state so the editable widgets below have stable defaults
# across reruns.
if "meta_title" not in st.session_state:
    st.session_state.meta_title = ""
if "meta_description" not in st.session_state:
    st.session_state.meta_description = ""
if "article_title" not in st.session_state:
    st.session_state.article_title = ""
if "article_text" not in st.session_state:
    st.session_state.article_text = ""

# B- Step 1: Fetch Data — scrape the URL and stash each piece in session state.
if st.button("Fetch Data"):
    if url.strip():
        meta_title, meta_description, article_title, blog_text = extract_blog_content(url)
        st.session_state.meta_title = meta_title
        st.session_state.meta_description = meta_description
        st.session_state.article_title = article_title
        st.session_state.article_text = blog_text
    else:
        st.error("Please enter a valid URL.")

# C-Show the fetched data so user can modify
st.subheader("Modify Fetched Content")
st.session_state.meta_title = st.text_input("Meta Title", st.session_state.meta_title)
st.session_state.meta_description = st.text_area("Meta Description", st.session_state.meta_description)
st.session_state.article_title = st.text_input("Article Title", st.session_state.article_title)
st.session_state.article_text = st.text_area("Article Text", st.session_state.article_text)

# D- Checkboxes to select which parts to analyze
include_meta_title = st.checkbox("Include Meta Title")
include_meta_description = st.checkbox("Include Meta Description")
include_article_title = st.checkbox("Include Article Title")
include_article_text = st.checkbox("Include Article Text")

# E- Top % of Keywords
top_percentage = st.number_input("Top % of Keywords to Display", min_value=1, max_value=100, value=100, step=1)

# F- Analyze Button -> runs the original logic
if st.button("Analyze"):
    if not url.strip():
        st.error("Please enter a valid URL.")
    else:
        # Concatenate only the parts the user ticked above.
        selected_text = ""
        if include_meta_title:
            selected_text += st.session_state.meta_title + " "
        if include_meta_description:
            selected_text += st.session_state.meta_description + " "
        if include_article_title:
            selected_text += st.session_state.article_title + " "
        if include_article_text:
            selected_text += st.session_state.article_text

        if not selected_text.strip():
            st.error("No text selected for analysis. Please check at least one option.")
        else:
            # ========== ORIGINAL ANALYSIS LOGIC (unchanged) ==========
            # Tokenize, build 1..3-grams, and score each unique n-gram with
            # TF-IDF against the (single-document) selected text.
            tokens = preprocess_text(selected_text)
            ngrams_list = generate_ngrams(tokens, max_n=3)
            unique_ngrams = list(set(ngrams_list))

            if not unique_ngrams:
                st.error("Vocabulary is empty. Please ensure valid input data.")
            else:
                tfidf_vectorizer = TfidfVectorizer(vocabulary=unique_ngrams)
                tfidf_vectorizer.fit([" ".join(tokens)])
                tfidf_scores = tfidf_vectorizer.transform([" ".join(tokens)]).toarray()[0]

                # Keep only the 100 best-scoring n-grams for the API lookup.
                scored_keywords = sorted(
                    zip(unique_ngrams, tfidf_scores),
                    key=lambda x: x[1],
                    reverse=True
                )[:100]

                keywords = [kw for kw, _ in scored_keywords]

                metrics_response = get_keyword_metrics(keywords)
                if metrics_response:
                    # Select top keywords based on user percentage
                    top_keywords_data = select_top_keywords(metrics_response, top_percentage, scored_keywords)

                    # Tuple layout from select_top_keywords:
                    # (keyword, score_pct, volume, trend, cpc, competition)
                    data = {
                        "Keyword": [k[0] for k in top_keywords_data],
                        "Score (%)": [f"{k[1]:.2f}" for k in top_keywords_data],
                        "Search Volume": [k[2] for k in top_keywords_data],
                        "Trend": [k[3] for k in top_keywords_data],
                        "CPC": [k[4] for k in top_keywords_data],
                        "Competition": [k[5] for k in top_keywords_data],
                    }
                    df = pd.DataFrame(data)

                    st.dataframe(df)

                    # NOTE(review): these download widgets live inside the
                    # `if st.button("Analyze")` branch. Changing the selectbox
                    # triggers a rerun in which the button is no longer
                    # "pressed", so the results and download button likely
                    # disappear — confirm, and consider persisting the
                    # DataFrame in session_state instead.
                    output_format = st.selectbox("Download format", ["CSV", "Excel"])

                    if output_format == "CSV":
                        csv_data = df.to_csv(index=False).encode('utf-8')
                        st.download_button(
                            label="Download CSV",
                            data=csv_data,
                            file_name="keywords.csv",
                            mime="text/csv",
                            key="download-csv",
                        )
                    else:  # Excel
                        excel_buffer = io.BytesIO()
                        with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
                            df.to_excel(writer, index=False, sheet_name="Sheet1")
                        excel_data = excel_buffer.getvalue()

                        st.download_button(
                            label="Download Excel",
                            data=excel_data,
                            file_name="keywords.xlsx",
                            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                            key="download-excel",
                        )