Create main.py
main.py
ADDED
import string
from io import BytesIO

import nltk
import pandas as pd
import streamlit as st
from fuzzywuzzy import fuzz, process
from nltk.corpus import stopwords

nltk.download('stopwords')


# Helper functions
def clean_text(text):
    """Lowercase the text and strip punctuation (used for exact matching)."""
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text


def clean_text_fuzzy(text):
    """Lowercase the text, strip punctuation and English stopwords (used for fuzzy matching)."""
    stop_words = set(stopwords.words('english'))
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = [word for word in text.split() if word not in stop_words]
    return " ".join(words)


def process_matching(keywords, article, fuzzy, mode):
    """Count how often each keyword occurs in the article, exactly or fuzzily."""
    keywords = [clean_text(k) for k in keywords]

    words_exact = clean_text(article).split()
    words_fuzzy = clean_text_fuzzy(article).split()

    results = {}
    for keyword in keywords:
        # Compare each keyword only against n-grams of its own word count,
        # so shorter keywords are not double-counted inside longer n-grams.
        n = len(keyword.split())
        results.setdefault(keyword, 0)

        if fuzzy:
            n_grams = [" ".join(words_fuzzy[i:i + n]) for i in range(len(words_fuzzy) - n + 1)]
            matches = process.extract(keyword, n_grams, scorer=fuzz.partial_ratio, limit=None)
            results[keyword] += sum(1 for _, score in matches if score > 90)
        else:
            n_grams = [" ".join(words_exact[i:i + n]) for i in range(len(words_exact) - n + 1)]
            results[keyword] += n_grams.count(keyword)

    if mode == "filter":
        # Filter mode keeps only the keywords that appear at least once.
        results = {k: v for k, v in results.items() if v > 0}

    return results


# Streamlit app
st.title("Keyword Matcher")

# Mode selection
mode = st.radio("Select Mode:", ["Keyword Frequency", "Keyword Filter"], horizontal=True)
mode = "frequency" if mode == "Keyword Frequency" else "filter"

# Keyword input: comma-separated text, or the first column of an uploaded file
st.subheader("Keywords")
keywords_input = st.text_area("Enter keywords (comma separated):")
uploaded_file = st.file_uploader("Or upload a CSV/Excel file with keywords (first column):", type=["csv", "xlsx"])

keywords = []
if uploaded_file:
    if uploaded_file.name.endswith(".csv"):
        df = pd.read_csv(uploaded_file)
    else:
        df = pd.read_excel(uploaded_file)
    if not df.empty:
        keywords = df.iloc[:, 0].dropna().astype(str).tolist()
else:
    keywords = [k.strip() for k in keywords_input.split(",") if k.strip()]

# Article input
st.subheader("Article")
article = st.text_area("Paste the article text here:")

# Fuzzy matching toggle
fuzzy = st.checkbox("Enable Fuzzy Matching")

# Process button
if st.button("Process"):
    if not keywords:
        st.error("Please provide keywords.")
    elif not article:
        st.error("Please provide an article.")
    else:
        results = process_matching(keywords, article, fuzzy, mode)

        st.subheader("Results")
        for keyword, count in results.items():
            st.write(f"{keyword}: {count}")

        # Offer the results as an Excel download. The sheet is written to an
        # in-memory buffer because DataFrame.to_excel() returns None, not bytes.
        # (A "save" checkbox inside this button branch would reset the button
        # state on rerun, so the download button is shown directly.)
        df_results = pd.DataFrame(list(results.items()), columns=["Keyword", "Count"])
        buffer = BytesIO()
        df_results.to_excel(buffer, index=False, engine='openpyxl')
        st.download_button(
            label="Download Results as Excel",
            data=buffer.getvalue(),
            file_name="results.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )
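
For a quick look at the fuzzy n-gram scoring that process_matching relies on, outside of Streamlit, here is a minimal sketch (assuming fuzzywuzzy is installed; the sample sentence and keyword are illustrative only and not part of the Space):

from fuzzywuzzy import fuzz, process

article = "machine learning models learn patterns from data"
words = article.split()
# Sliding-window bigrams, mirroring the n-gram construction in process_matching.
bigrams = [" ".join(words[i:i + 2]) for i in range(len(words) - 1)]

# Score every bigram against the keyword and count those above the 90 threshold.
matches = process.extract("machine learning", bigrams, scorer=fuzz.partial_ratio, limit=None)
print(sum(1 for _, score in matches if score > 90))  # should print 1: only the "machine learning" bigram clears 90

To try the app itself locally, run streamlit run main.py with streamlit, pandas, fuzzywuzzy, nltk, and openpyxl installed.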