Jonas Leeb committed on
Commit
ba1724a
·
1 Parent(s): 15c05d2
Files changed (2) hide show
  1. app.py +105 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import gradio as gr
3
+ from scipy.sparse import load_npz
4
+ import numpy as np
5
+ import json
6
+ from datasets import load_dataset
7
+
8
# --- Load data and embeddings ---
# One vocabulary term per line. A term's position in this list is used as its
# column index into the TF-IDF matrix (see keyword_match_ranking), so every
# line is kept -- including blank ones -- to preserve that alignment.
# The encoding is stated explicitly so the vocabulary does not depend on the
# host's locale default.
with open("feature_names.txt", "r", encoding="utf-8") as f:
    feature_names = [line.strip() for line in f]

# Sparse TF-IDF matrix loaded from disk; it is indexed by (document row,
# feature column) when ranking.
tfidf_matrix = load_npz("tfidf_matrix_train.npz")

# Load dataset and initialize search engine
dataset = load_dataset("ccdv/arxiv-classification", "no_ref")  # replace with your dataset
16
+
17
# Matches the arXiv stamp, e.g. "arXiv:1706.03762v5". `v\d+` (rather than a
# single digit) keeps multi-digit versions such as "v12" intact, so the
# generated link points at the right revision.
_ARXIV_ID_RE = re.compile(r'arxiv:\d{4}\.\d{4,5}v\d+', flags=re.IGNORECASE)


def _parse_header(text):
    """Split a raw document into a title and an arXiv identifier.

    Every line that precedes the first line starting with "arxiv:"
    (case-insensitive) is collected as part of the title. If no such line
    exists, all lines end up in the title.

    Returns:
        A ``(title, arxiv_id)`` tuple. ``arxiv_id`` is the lower-cased match
        (still carrying its "arxiv:" prefix), or ``None`` when no stamp line
        exists or the stamp line does not contain a recognisable id.
    """
    title_lines = []
    arxiv_id = None
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if stripped.lower().startswith("arxiv:"):
            match = _ARXIV_ID_RE.search(stripped)
            if match:
                arxiv_id = match.group(0).lower()
            # Only the first stamp line is inspected and nothing after it
            # contributes to the title, so scanning can stop here.
            break
        title_lines.append(stripped)
    return " ".join(title_lines).strip(), arxiv_id


# Parallel lists: position i in each describes the same document.
documents = []
titles = []
arxiv_ids = []

for item in dataset["train"]:
    text = item["text"]
    # NOTE(review): skipping short/empty texts shifts the positions of all
    # later documents. Search results index these lists with TF-IDF row
    # numbers, so this is only safe if the matrix was built with the same
    # filter -- confirm against the script that produced the .npz file.
    if not text or len(text.strip()) < 10:
        continue

    title, arxiv_id = _parse_header(text)
    documents.append(text.strip())
    titles.append(title)
    arxiv_ids.append(arxiv_id)
48
+
49
+
50
def keyword_match_ranking(query, top_n=5, *, vocabulary=None, matrix=None):
    """Rank documents by the summed TF-IDF weight of the query's terms.

    The query is lower-cased and split on whitespace; each vocabulary entry
    that equals one of the resulting terms selects a matrix column. A
    document's score is the sum of its values in the selected columns.

    Args:
        query: Free-text search string.
        top_n: Maximum number of results to return.
        vocabulary: Optional sequence of feature names, one per matrix
            column. Defaults to the module-level ``feature_names``.
        matrix: Optional sparse document-by-feature matrix supporting column
            indexing. Defaults to the module-level ``tfidf_matrix``.

    Returns:
        A list of ``(doc_idx, score)`` tuples with ``score > 0``, sorted by
        descending score (ties keep ascending document order), truncated to
        ``top_n``. Empty when no query term is in the vocabulary.
    """
    if vocabulary is None:
        vocabulary = feature_names
    if matrix is None:
        matrix = tfidf_matrix

    # A set makes each vocabulary membership test O(1) instead of scanning
    # the list of query terms for every feature.
    query_terms = set(query.lower().split())
    query_indices = [i for i, term in enumerate(vocabulary) if term in query_terms]
    if not query_indices:
        return []

    # One sparse column slice + row sum replaces a Python-level loop over
    # every document. np.asarray(...).ravel() flattens the result whether
    # scipy hands back an (n, 1) np.matrix or a 1-D array.
    totals = np.asarray(matrix[:, query_indices].sum(axis=1)).ravel()

    hits = np.flatnonzero(totals > 0)
    # Stable sort on the negated scores gives descending order while keeping
    # equal-scored documents in ascending index order.
    order = hits[np.argsort(-totals[hits], kind="stable")][:top_n]
    return [(int(doc_idx), float(totals[doc_idx])) for doc_idx in order]
65
+
66
+
67
# Finds the words "abstract" or "introduction", case-insensitively, allowing
# whitespace between the individual letters (e.g. "A B S T R A C T").
_SECTION_HEADING_RE = re.compile(
    r'a\s*b\s*s\s*t\s*r\s*a\s*c\s*t|i\s*n\s*t\s*r\s*o\s*d\s*u\s*c\s*t\s*i\s*o\s*n',
    re.IGNORECASE,
)


def snippet_before_abstract(text):
    """Return the part of *text* that precedes its first section heading.

    The cut is made at the earliest occurrence of "abstract" or
    "introduction" as matched by ``_SECTION_HEADING_RE``. When neither
    occurs, the first 100 characters are used instead. The result is
    stripped of surrounding whitespace.
    """
    heading = _SECTION_HEADING_RE.search(text)
    cutoff = heading.start() if heading else 100
    return text[:cutoff].strip()
74
+
75
+
76
def search_function(query):
    """Run a keyword search and render the hits as Markdown.

    Ranked hits that have no arXiv identifier are skipped, so fewer results
    than were ranked may be shown. Each remaining hit is rendered as a
    numbered heading, a link to its arXiv abstract page, and the text that
    precedes the document's abstract/introduction.

    Args:
        query: Free-text search string from the UI.

    Returns:
        A Markdown string, or "No results found." when nothing matched or
        none of the matches could be linked.
    """
    import html  # function-scope import keeps this block self-contained

    results = keyword_match_ranking(query)
    if not results:
        return "No results found."

    sections = []
    for idx, _score in results:
        arxiv_id = arxiv_ids[idx]
        if not arxiv_id:
            continue

        link = f"https://arxiv.org/abs/{arxiv_id.replace('arxiv:', '')}"
        # Escape before adding our own <br> tags: the snippet is raw dataset
        # text and is placed inside an HTML <pre> element, where a stray
        # "<" or "&" would otherwise be interpreted as markup.
        snippet = html.escape(
            snippet_before_abstract(documents[idx]), quote=False
        ).replace('\n', '<br>')
        display_rank = len(sections) + 1
        sections.append(
            f"### Document {display_rank}\n"
            f"[arXiv Link]({link})\n\n"
            f"<pre>{snippet}</pre>\n\n---\n"
        )

    # Every ranked hit may have been skipped above; say so explicitly rather
    # than returning an empty string, which renders as a blank panel.
    if not sections:
        return "No results found."
    return "".join(sections)
95
+
96
+
97
# Single-line text input for the search query.
_query_box = gr.Textbox(lines=1, placeholder="Enter your search query")

# Wire the search function to a text-in / Markdown-out web UI.
iface = gr.Interface(
    fn=search_function,
    inputs=_query_box,
    outputs=gr.Markdown(),
    title="arXiv Search Engine",
    description="Search TF-IDF encoded arXiv papers by keyword.",
)

# Start the web app when this module is executed.
iface.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio
2
+ scipy
3
+ numpy
4
+ datasets