mojad121 committed on
Commit
e41a696
·
verified ·
1 Parent(s): 65f5325

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -51
app.py CHANGED
@@ -1,64 +1,137 @@
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
3
 
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
# Shared inference client used by respond(); bound to the zephyr-7b-beta model.
client = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta")
8
 
 
 
9
 
10
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream a chat completion for ``message`` given the prior ``history``.

    Args:
        message: The latest user message.
        history: Previous turns as ``(user_text, assistant_text)`` pairs;
            falsy entries in a pair are skipped.
        system_message: Text sent as the leading ``system`` message.
        max_tokens: Forwarded to ``client.chat_completion`` as ``max_tokens``.
        temperature: Forwarded as the sampling temperature.
        top_p: Forwarded as the nucleus-sampling threshold.

    Yields:
        The response text accumulated so far, once per streamed chunk.
    """
    messages = [{"role": "system", "content": system_message}]

    for turn in history:
        if turn[0]:
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})

    messages.append({"role": "user", "content": message})

    response = ""

    # Use a dedicated loop name so the ``message`` parameter is not shadowed.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        # A streamed delta may carry no text (content is None); concatenating
        # it would raise TypeError, so only append real text.
        if token:
            response += token
        yield response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
- """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
# Extra controls shown next to the chat box; their values are passed to
# respond() after (message, history), in this order.
_system_message_box = gr.Textbox(value="You are a friendly Chatbot.", label="System message")
_max_tokens_slider = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
_temperature_slider = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
_top_p_slider = gr.Slider(
    minimum=0.1,
    maximum=1.0,
    value=0.95,
    step=0.05,
    label="Top-p (nucleus sampling)",
)

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        _system_message_box,
        _max_tokens_slider,
        _temperature_slider,
        _top_p_slider,
    ],
)


if __name__ == "__main__":
    demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import re
4
+ import math
5
+ import nltk
6
+ from collections import defaultdict, Counter
7
+ from nltk.tokenize import word_tokenize
8
+ from nltk.stem import PorterStemmer, WordNetLemmatizer
9
  import gradio as gr
 
10
 
11
# Add the local ./nltk_data directory to NLTK's resource search path.
nltk.data.path.append("./nltk_data")

# Raw datasets: each is the parsed content of a JSON file in the working directory.
with open("docs.json", "r", encoding="utf-8") as f:
    docs_ds = json.load(f)

with open("queries.json", "r", encoding="utf-8") as f:
    queries_ds = json.load(f)

# Re-key both datasets by their integer id for direct lookup.
documents = {
    int(entry["doc_id"]): entry["text"]
    for entry in docs_ds
}
queries = {
    int(entry["query_id"]): entry["text"]
    for entry in queries_ds
}
 
 
 
21
 
22
# Tokens dropped from both documents and free-text queries before weighting.
stop_words = {
    "a", "is", "the", "of", "all", "and", "to", "can", "be", "as",
    "once", "for", "at", "am", "are", "has", "have", "had", "up",
    "his", "her", "in", "on", "no", "we", "do",
}
23
 
24
def _new_position_map():
    """Return an empty ``doc_id -> [positions]`` mapping for a single term."""
    return defaultdict(list)


# stemmed term -> set of doc_ids that contain it
inverted_index = defaultdict(set)
# stemmed term -> doc_id -> positions of the term in the filtered token list
positional_index = defaultdict(_new_position_map)
# doc_id -> {lemma: tf-idf weight}
tf_idf_vectors = defaultdict(dict)
# lemma -> inverse document frequency
idf_scores = {}
28
 
29
def process_documents(documents):
    """Fill the module-level indexes from ``documents``.

    For every ``doc_id -> text`` entry the text is lower-cased, tokenized,
    stripped of non-alphanumeric tokens and stop words, and lemmatized.
    The stemmed lemmas feed ``inverted_index`` and ``positional_index``
    (positions refer to the filtered token list), while the un-stemmed
    lemmas feed ``idf_scores`` and ``tf_idf_vectors``.
    """
    porter = PorterStemmer()
    wordnet = WordNetLemmatizer()
    document_frequency = defaultdict(int)
    counts_by_doc = {}

    for doc_id, text in documents.items():
        lemmas = []
        for raw in word_tokenize(text.lower()):
            if raw.isalnum() and raw not in stop_words:
                lemmas.append(wordnet.lemmatize(raw))
        counts_by_doc[doc_id] = Counter(lemmas)

        for position, lemma in enumerate(lemmas):
            stem = porter.stem(lemma)
            inverted_index[stem].add(doc_id)
            positional_index[stem][doc_id].append(position)

        # Each distinct lemma counts once per document.
        for lemma in set(lemmas):
            document_frequency[lemma] += 1

    corpus_size = len(documents)
    for lemma, df in document_frequency.items():
        idf_scores[lemma] = math.log(corpus_size / df)

    for doc_id, counts in counts_by_doc.items():
        weights = {}
        for lemma, count in counts.items():
            weights[lemma] = count * idf_scores[lemma]
        tf_idf_vectors[doc_id] = weights
50
 
51
def execute_boolean_query(query, documents):
    """Evaluate a whitespace-separated boolean query against ``inverted_index``.

    Supported operators are ``and``, ``or`` and ``not`` (case-insensitive).
    ``not`` complements the next term against all ids in ``documents``;
    ``and`` / ``or`` are applied strictly left to right with no precedence.

    Terms are lemmatized and then stemmed, mirroring how ``process_documents``
    builds the index keys, so irregular forms resolve to the same key.

    Malformed input no longer raises: a dangling operator is ignored, a
    query with no terms yields ``[]``, and two adjacent terms with no
    operator between them are combined with an implicit ``and``.

    Args:
        query: The boolean query string.
        documents: Mapping whose keys are all known doc ids (used for ``not``).

    Returns:
        Sorted list of matching doc ids.
    """
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    all_doc_ids = set(documents.keys())

    result = None      # running result; None until the first term is seen
    pending_op = None  # binary operator waiting for its right-hand term
    negate = False     # whether the next term is complemented

    for token in query.lower().split():
        if token == 'not':
            # Toggling keeps "not not x" equivalent to "x".
            negate = not negate
            continue
        if token in ('and', 'or'):
            pending_op = token
            continue

        # Same normalization as the index: lemmatize first, then stem.
        key = stemmer.stem(lemmatizer.lemmatize(token))
        postings = inverted_index.get(key, set())
        if negate:
            postings = all_doc_ids - postings
            negate = False

        if result is None:
            result = postings
        elif pending_op == 'or':
            result = result | postings
        else:
            # Explicit 'and', or no operator between two terms.
            result = result & postings
        pending_op = None

    return sorted(result) if result is not None else []
78
 
79
def execute_proximity_query(query):
    """Find documents where two terms occur within ``k`` positions of each other.

    The query has the form ``"<word1> <word2> /<k>"``. Positions come from
    ``positional_index``, which ``process_documents`` fills from the token
    list after stop words and non-alphanumeric tokens have been removed.

    Both words are lemmatized and then stemmed, matching how the index keys
    are built, and surrounding whitespace in ``query`` is ignored.

    Args:
        query: The proximity query string.

    Returns:
        Sorted list of doc ids with at least one pair of occurrences whose
        distance ``d`` satisfies ``0 < d <= k``; ``[]`` if the query does not
        match the expected form.
    """
    match = re.match(r'(\w+)\s+(\w+)\s*/\s*(\d+)', query.strip())
    if not match:
        return []

    first, second, distance = match.groups()
    max_gap = int(distance)

    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    # Same normalization as the index: lemmatize first, then stem.
    term1 = stemmer.stem(lemmatizer.lemmatize(first.lower()))
    term2 = stemmer.stem(lemmatizer.lemmatize(second.lower()))

    # .get() avoids creating empty entries in the defaultdict for unknown terms.
    postings1 = positional_index.get(term1)
    postings2 = positional_index.get(term2)
    if not postings1 or not postings2:
        return []

    result_docs = set()
    for doc_id, positions1 in postings1.items():
        positions2 = postings2.get(doc_id)
        if not positions2:
            continue
        if any(0 < abs(p1 - p2) <= max_gap for p1 in positions1 for p2 in positions2):
            result_docs.add(doc_id)
    return sorted(result_docs)
97
 
98
def evaluate_cosine_similarity_score(vec1, vec2):
    """Return the cosine similarity of two sparse vectors.

    Each vector is a mapping from term to numeric weight; terms missing from
    a mapping count as zero. If either vector has zero length the result is
    ``0.0``.
    """
    shared_terms = set(vec1.keys()).intersection(set(vec2.keys()))
    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

    magnitude1 = math.sqrt(sum(weight ** 2 for weight in vec1.values()))
    magnitude2 = math.sqrt(sum(weight ** 2 for weight in vec2.values()))

    if magnitude1 != 0 and magnitude2 != 0:
        return dot_product / (magnitude1 * magnitude2)
    return 0.0
 
 
 
 
 
 
 
 
 
 
106
 
107
def process_query(user_input_query):
    """Turn a free-text query into a ``{lemma: tf-idf weight}`` vector.

    The query is lower-cased, tokenized, filtered (alphanumeric tokens that
    are not stop words) and lemmatized. Each lemma's weight is its count in
    the query times its entry in ``idf_scores``; lemmas absent from
    ``idf_scores`` get weight 0.
    """
    wordnet = WordNetLemmatizer()

    lemmas = []
    for raw in word_tokenize(user_input_query.lower()):
        if not raw.isalnum() or raw in stop_words:
            continue
        lemmas.append(wordnet.lemmatize(raw))

    weights = {}
    for lemma, count in Counter(lemmas).items():
        weights[lemma] = count * idf_scores.get(lemma, 0)
    return weights
113
 
114
def execute_vsm_query(user_input_query, alpha=0.001):
    """Rank documents by cosine similarity to the query.

    Args:
        user_input_query: Free-text query.
        alpha: Minimum similarity a document needs to be included.

    Returns:
        Doc ids whose similarity is at least ``alpha``, highest score first.
    """
    query_vector = process_query(user_input_query)

    scored = []
    for doc_id, doc_vector in tf_idf_vectors.items():
        similarity = evaluate_cosine_similarity_score(query_vector, doc_vector)
        if similarity >= alpha:
            scored.append((doc_id, similarity))

    # Stable sort: equally scored documents keep their index order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [doc_id for doc_id, _ in scored]
122
+
123
def chat(query, method):
    """Run ``query`` through the selected retrieval model and format the result.

    ``method`` selects the model: ``"Boolean"`` or ``"Proximity"``; any other
    value falls back to the vector space model. An empty query returns an
    error message instead of a result set.
    """
    if not query:
        return "Query cannot be empty"

    if method == "Boolean":
        matches = execute_boolean_query(query, documents)
    elif method == "Proximity":
        matches = execute_proximity_query(query)
    else:
        matches = execute_vsm_query(query)

    return f"Result-set: {matches}"
133
+
134
# Build all indexes once at import time so every query handler can use them.
process_documents(documents)

# Two inputs (query text and retrieval model) mapped onto chat(); text output.
demo = gr.Interface(
    fn=chat,
    inputs=[
        "text",
        gr.Radio(["Boolean", "Proximity", "Vector Space Model"], label="Model"),
    ],
    outputs="text",
)

# Only start the server when run as a script, so importing this module
# (e.g. to reuse the query functions) does not block on launch().
if __name__ == "__main__":
    demo.launch()