abdulllah01 committed on
Commit
e8d4fbe
·
verified ·
1 Parent(s): 416370a

Create main.py

Browse files
Files changed (1) hide show
  1. main.py +106 -0
main.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from fuzzywuzzy import fuzz
4
+ from fuzzywuzzy import process
5
+ import string
6
+ from nltk.corpus import stopwords
7
+ import nltk
8
+
9
# Streamlit re-executes this script on every widget interaction, so only
# invoke the downloader when the stop-word corpus is not already installed.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
10
+
11
+ # Helper functions
12
def clean_text(text):
    """Return *text* lowercased with every ASCII punctuation character removed.

    Whitespace is left untouched, so removing punctuation that sat between
    two spaces leaves both spaces in place.
    """
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    return text.lower().translate(punctuation_stripper)
16
+
17
# Lazily populated by _english_stop_words(); None means "not loaded yet".
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the English stop-word set, reading it from NLTK only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def clean_text_fuzzy(text):
    """Normalise *text* for fuzzy matching.

    The text is lowercased, ASCII punctuation is removed, English stop words
    are dropped, and the remaining words are joined with single spaces.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string; empty if every word was a stop word.
    """
    # The stop-word list never changes between calls, so it is cached rather
    # than re-read from the corpus each time this function runs.
    stop_words = _english_stop_words()
    normalized = text.lower().translate(str.maketrans('', '', string.punctuation))
    return " ".join(word for word in normalized.split() if word not in stop_words)
24
+
25
def process_matching(keywords, article, fuzzy, mode, score_threshold=90):
    """Count how often each keyword occurs in an article.

    Keywords are normalised with ``clean_text`` and the normalised form is
    used as the key of the returned mapping. Each keyword is compared only
    against article n-grams that have the same number of words as the
    keyword, so a keyword's count does not depend on which other keywords
    were supplied.

    Args:
        keywords: Iterable of keywords. Non-string values (for example
            numeric spreadsheet cells) are converted with ``str``.
        article: The article text to search.
        fuzzy: If true, match with ``fuzz.partial_ratio`` against the article
            after stop-word removal; otherwise require exact n-gram equality.
        mode: ``"filter"`` drops keywords with a zero count; any other value
            keeps every keyword.
        score_threshold: Fuzzy score an n-gram must exceed to count as a
            match. Only used when ``fuzzy`` is true.

    Returns:
        dict mapping each distinct normalised keyword to its match count, in
        order of first appearance. Empty when no keywords are given.
    """
    # dict.fromkeys de-duplicates while keeping order, so "Data" and "data"
    # are counted once instead of once per spelling.
    results = dict.fromkeys((clean_text(str(k)) for k in keywords), 0)

    # Tokenise the article once, using the normalisation for the chosen mode.
    if fuzzy:
        tokens = clean_text_fuzzy(article).split()
    else:
        tokens = clean_text(article).split()

    n_grams_by_length = {}
    for keyword in list(results):
        if fuzzy:
            # Strip stop words from the keyword too, so it is compared with
            # article text that went through the same normalisation.
            match_form = clean_text_fuzzy(keyword)
        else:
            # N-grams are joined with single spaces; collapse whitespace left
            # behind by punctuation removal so the keyword can still match.
            match_form = " ".join(keyword.split())

        n = len(match_form.split())
        if n == 0:
            # Nothing left to search for (blank or all stop words).
            continue

        if n not in n_grams_by_length:
            n_grams_by_length[n] = [
                " ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)
            ]
        n_grams = n_grams_by_length[n]
        if not n_grams:
            # The article has fewer than n words.
            continue

        if fuzzy:
            matches = process.extract(
                match_form, n_grams, scorer=fuzz.partial_ratio, limit=None
            )
            results[keyword] = sum(
                1 for _, score in matches if score > score_threshold
            )
        else:
            results[keyword] = n_grams.count(match_form)

    if mode == "filter":
        results = {k: v for k, v in results.items() if v > 0}

    return results
53
+
54
# Streamlit app
import io  # imported here because only the Excel export below needs it

st.title("Keyword Matcher")

# Mode selection
mode_label = st.radio("Select Mode:", ["Keyword Frequency", "Keyword Filter"], horizontal=True)
mode = "frequency" if mode_label == "Keyword Frequency" else "filter"

# Keyword input
st.subheader("Keywords")
keywords_input = st.text_area("Enter keywords (comma separated):")
uploaded_file = st.file_uploader("Or upload a CSV/Excel file with keywords (first column):", type=["csv", "xlsx"])

keywords = []
if uploaded_file:
    # Compare the extension case-insensitively so a file named "KEYWORDS.CSV"
    # is not handed to the Excel reader.
    if uploaded_file.name.lower().endswith(".csv"):
        df = pd.read_csv(uploaded_file)
    else:
        df = pd.read_excel(uploaded_file)
    if not df.empty:
        # Cells may be numbers or padded strings; convert to stripped text
        # and drop anything that ends up blank.
        file_values = (str(value).strip() for value in df.iloc[:, 0].dropna())
        keywords = [value for value in file_values if value]
else:
    keywords = [k.strip() for k in keywords_input.split(",") if k.strip()]

# Article input
st.subheader("Article")
article = st.text_area("Paste the article text here:")

# Fuzzy matching checkbox
fuzzy = st.checkbox("Enable Fuzzy Matching")

# Process button
# st.button is only True on the rerun triggered by the click, so the results
# are kept in session state; otherwise they (and the export widgets) would
# vanish as soon as the user touched any other widget.
if st.button("Process"):
    if not keywords:
        st.session_state.pop("keyword_results", None)
        st.error("Please provide keywords.")
    elif not article:
        st.session_state.pop("keyword_results", None)
        st.error("Please provide an article.")
    else:
        st.session_state["keyword_results"] = process_matching(keywords, article, fuzzy, mode)

results = st.session_state.get("keyword_results")
if results is not None:
    st.subheader("Results")
    for keyword, count in results.items():
        st.write(f"{keyword}: {count}")

    # Save to Excel
    if st.checkbox("Save results to Excel"):
        results_df = pd.DataFrame(list(results.items()), columns=["Keyword", "Count"])
        # to_excel needs a target to write to and returns None, so write the
        # workbook into an in-memory buffer and hand its bytes to the button.
        excel_buffer = io.BytesIO()
        results_df.to_excel(excel_buffer, index=False, engine='openpyxl')
        st.download_button(
            label="Download Results as Excel",
            data=excel_buffer.getvalue(),
            file_name="results.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )