zakihassan04 committed on
Commit
626a5ef
·
verified ·
1 Parent(s): 972c23f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -0
app.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ import torch
4
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer
5
+ from sentence_transformers import SentenceTransformer, util
6
+
7
+ # Load dataset
8
with open("data/gpt2_ready_filtered.jsonl", "r", encoding="utf-8") as f:
    # One JSON object per line. Blank lines (e.g. a stray empty line at the
    # end of the file) are skipped because json.loads("") raises
    # JSONDecodeError and would abort start-up.
    data = [json.loads(line) for line in f if line.strip()]

# Every record is expected to carry its training string under the "text" key.
texts = [item["text"] for item in data]
12
+
13
+ # SomaliQA class
14
class SomaliQA:
    """Dataset-backed question answering for Somali text.

    Each dataset entry is expected to look like
    ``"Su'aal: <question>\\nJawaab: <answer>"``.  A user question is answered
    by (1) an exact, case-insensitive lookup of the question and, failing
    that, (2) the answer of the semantically closest dataset entry.
    Answers are always taken from the dataset; nothing is generated.
    """

    # Reply returned when no dataset entry yields an answer.
    NO_ANSWER_MESSAGE = "Ma helin jawaab ku habboon su’aashaada."

    # Number of nearest neighbours inspected during the semantic step, so an
    # unparsable best hit can fall back to the next candidate.
    SEMANTIC_TOP_K = 3

    def __init__(self, dataset_texts):
        """Index the dataset and load the embedding and GPT-2 models.

        Args:
            dataset_texts: list of raw dataset strings.
        """
        self.texts = dataset_texts
        # Parsed once here instead of re-parsing every entry on each query.
        self._exact_answers = self._index_exact_answers(self.texts)
        self.embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
        self.embeddings = self.embedder.encode(self.texts, convert_to_tensor=True)
        # NOTE(review): the tokenizer and model are not used by answer();
        # they are still loaded so the existing attributes stay available.
        self.tokenizer = GPT2Tokenizer.from_pretrained("zakihassan04/gpt2-finetuned-somali")
        self.model = GPT2LMHeadModel.from_pretrained("zakihassan04/gpt2-finetuned-somali")
        self.tokenizer.pad_token = self.tokenizer.eos_token

    @staticmethod
    def _normalize_question(question):
        """Return the canonical lookup key: trimmed, no trailing '?', lowercased."""
        return question.strip().rstrip("?").strip().lower()

    def _index_exact_answers(self, texts):
        """Map each normalized dataset question to its answer (first entry wins)."""
        index = {}
        for text in texts:
            question, reply = self.extract_qa(text)
            if not question:
                continue
            key = self._normalize_question(question)
            if key:
                index.setdefault(key, reply)
        return index

    def _answer_from_hits(self, hits):
        """Return the answer of the first hit whose entry parses, else ``None``.

        Args:
            hits: iterable of mappings with a ``"corpus_id"`` index into
                ``self.texts``, ordered best match first.
        """
        for hit in hits:
            _, reply = self.extract_qa(self.texts[hit["corpus_id"]])
            if reply is not None:
                return reply
        return None

    def extract_qa(self, text):
        """Split a dataset entry into ``(question, answer)``.

        Returns ``(None, None)`` unless the text contains exactly one
        ``"\\nJawaab:"`` separator.
        """
        parts = text.split("\nJawaab:")
        if len(parts) != 2:
            return None, None
        question_part, answer_part = parts
        return question_part.replace("Su'aal:", "").strip(), answer_part.strip()

    def answer(self, user_question):
        """Return the dataset answer for ``user_question``.

        Args:
            user_question: free-text question; a trailing "?" is optional.

        Returns:
            The matching answer string, or ``NO_ANSWER_MESSAGE`` when the
            question is empty or no usable dataset entry is found.
        """
        cleaned_question = (user_question or "").strip().rstrip("?")
        lookup_key = self._normalize_question(cleaned_question)
        if not lookup_key:
            # Nothing to search for (empty input or only punctuation).
            return self.NO_ANSWER_MESSAGE

        # Step 1: exact match. Both sides are normalized identically, so a
        # dataset question stored with a trailing "?" still matches.
        exact = self._exact_answers.get(lookup_key)
        if exact is not None:
            return exact

        # Step 2: semantic match against the precomputed dataset embeddings.
        user_emb = self.embedder.encode(cleaned_question, convert_to_tensor=True)
        hits = util.semantic_search(user_emb, self.embeddings, top_k=self.SEMANTIC_TOP_K)
        if hits:
            matched = self._answer_from_hits(hits[0])
            if matched is not None:
                return matched

        return self.NO_ANSWER_MESSAGE
50
+
51
+ # Init model
52
qa_system = SomaliQA(dataset_texts=texts)


# Gradio UI
def qa_interface(question):
    """Gradio callback: forward the submitted question to the shared QA system."""
    reply = qa_system.answer(question)
    return reply
57
+
58
+ # Gradio interface
59
# Build the text-in/text-out interface first, then start serving it.
demo = gr.Interface(
    fn=qa_interface,
    inputs="text",
    outputs="text",
    title="Somali QA Chatbot (Dataset-based)",
    description="Weydii su’aal la xiriirta beeralayda — jawaabta waxa laga soo saaraa dataset-kaaga (GPT2 fine-tuned).",
    theme="compact",
)
demo.launch()