- refactor exhausted search API
- modify query matching `in_string` to fully text-based matching
- support lemmatization search (via spacy, time-consuming)
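The `in_string` change means matching now happens on tokenized text: both the query statement and the paper field are lowercased, tokenized (optionally lemmatized with spacy), and compared as substrings of the re-joined tokens. A minimal sketch of the idea, assuming the `en_core_web_sm` model is installed; the `lemma_match` helper is illustrative, not the repo's API:

```python
import spacy

# requires: pip install spacy && python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

def lemma_match(statement: str, string: str) -> bool:
    # lowercase, run the spacy pipeline, re-join lemmas, then do a plain
    # substring test -- this is why lemmatization is time-consuming:
    # every comparison pays for two full pipeline runs
    stmt = " ".join(tok.lemma_ for tok in nlp(statement.lower()))
    text = " ".join(tok.lemma_ for tok in nlp(string.lower()))
    return stmt in text

# plural/singular variants now match (lemma output is model-dependent)
print(lemma_match("event argument", "Event Arguments Extraction with BERT"))  # True
```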
- .gitignore +1 -0
- README.md +7 -4
- requirements.txt +2 -1
- run.py +1 -1
- scripts/get_aclanthology.sh +1 -1
- server.py +2 -2
- src/engine.py +60 -39
- src/interfaces/dblp.py +1 -2
- tox.ini +2 -1
.gitignore
CHANGED
```diff
@@ -131,3 +131,4 @@ dmypy.json
 cache/
 .coverage
 results/
+debug.py
```
README.md
CHANGED
````diff
@@ -20,20 +20,23 @@ A toolkit to help search for papers from aclanthology, arXiv and dblp.
 ## Setup
 
 1. Make sure you have [Git](https://git-scm.com/) and [Python](https://www.python.org/downloads/) 3.10.8 installed (or Python >= 3.9).
-2. Install dependencies: `pip install -r requirements.txt`
+2. Install dependencies: `pip install -r requirements.txt`, `python -m spacy download en_core_web_sm`
 
 ## QuickStart
 
 Run the example in `run.py`:
 
 ```bash
-
+# clone this repo
 $ git clone https://github.com/Spico197/paper-hero.git
 $ cd paper-hero
-
+# download and install dependencies
+$ pip install -r requirements.txt
+$ python -m spacy download en_core_web_sm
+# get ready for the acl data, since it is cache-based
 $ bash scripts/get_aclanthology.sh
 $ python run.py
-
+# the results will be saved into `results/`, check them out
 $ ls results
 ```
 
````
requirements.txt
CHANGED
```diff
@@ -2,4 +2,5 @@ tqdm>=4.64.1
 requests>=2.28.1
 feedparser>=6.0.10
 fastapi>=0.88.0
-uvicorn>=0.20.0
+uvicorn>=0.20.0
+spacy>=3.5.0
```
run.py
CHANGED
```diff
@@ -43,7 +43,7 @@ if __name__ == "__main__":
         "month": [
             # the same as the `year` field
             ["4", "11"],
-        ]
+        ],
     }
     ee_papers = acl_paper_list.search(ee_query)
    dump_paper_list_to_markdown_checklist(ee_papers, "results/ee-paper-list.md")
```
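For reference, `search` takes a dict mapping fields to OR-lists of AND-lists, and `year`/`month` entries are inclusive `[start, end]` digit spans (per the assertions in `src/engine.py`). A hedged sketch of a fuller query; the `title` field and its values are illustrative, only the `month` span comes from `run.py`:

```python
ee_query = {
    "title": [
        # outer list is OR-ed, each inner list is AND-ed
        ["event", "extraction"],
        ["event argument"],
    ],
    "month": [
        # inclusive span, digits as strings: April through November
        ["4", "11"],
    ],
}
ee_papers = acl_paper_list.search(ee_query)  # method="exhausted" by default
```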
scripts/get_aclanthology.sh
CHANGED
```diff
@@ -1,6 +1,6 @@
 set -ex
 
-mkdir cache
+mkdir -p cache
 cd cache
 if ! [ -f acl-anthology/bin/anthology/anthology.py ]; then
     git clone https://github.com/acl-org/acl-anthology
```
server.py
CHANGED
```diff
@@ -1,8 +1,8 @@
 import logging
 import os
-import uuid
-import tempfile
 import pathlib
+import tempfile
+import uuid
 
 import uvicorn
 from fastapi import FastAPI
```
src/engine.py
CHANGED
```diff
@@ -1,3 +1,8 @@
+from collections import defaultdict
+
+import spacy
+from tqdm import tqdm
+
 from src.interfaces import Paper
 
 
@@ -6,51 +11,58 @@ class SearchAPI:
 
     def __init__(self) -> None:
         self.papers: list[Paper] = []
+        self.nlp = None
+
+    def in_string(self, statement: str, string: str, lemmatization: bool = False):
+        _stmt = " ".join(self.tokenize(statement, lemmatization=lemmatization))
+        _string = " ".join(self.tokenize(string, lemmatization=lemmatization))
 
+        return _stmt in _string
+
+    def exhausted_lemma_search(
+        self, query: dict[str, tuple[tuple[str]]], lemmatization: bool = False
+    ) -> list[Paper]:
         """Exhausted search papers by matching query"""
-            return stmt_in_string
-
-        papers = self.papers
-        for field in self.SEARCH_PRIORITY:
-            if field in query:
-                req = query[field]
-                time_spans = []
-                if field in ["year", "month"]:
-                    for span in req:
+        results = []
+        fields = []
+        time_spans = defaultdict(list)
+        for f in self.SEARCH_PRIORITY:
+            if f in query:
+                fields.append(f)
+                if f in ["year", "month"]:
+                    for span in query[f]:
                         assert len(span) == 2
                         assert all(num.isdigit() for num in span)
-                        time_spans.append((int(span[0]), int(span[1])))
+                        time_spans[f].append((int(span[0]), int(span[1])))
 
+        pbar = tqdm(self.papers)
+        found = 0
+        for p in pbar:
+            for f in fields:
+                matched = False
+                or_statements = query[f]
+
+                if f in time_spans:
+                    for s, e in time_spans[f]:
+                        if s <= p[f] <= e:
                             matched = True
+                            break
+                else:
-                    for and_statements in req
+                    for and_statements in or_statements:
+                        if all(
+                            self.in_string(stmt, p[f], lemmatization=lemmatization)
+                            for stmt in and_statements
                         ):
                             matched = True
+                            break
+                if not matched:
+                    break
-                paper_indices.append(i)
-        papers = [papers[i] for i in paper_indices]
-        return papers
+            else:
+                results.append(p)
+                found += 1
+                pbar.set_postfix({"found": found})
 
+        return results
 
     def search(
         self, query: dict[str, tuple[tuple[str]]], method: str = "exhausted"
@@ -75,7 +87,11 @@ class SearchAPI:
         """
         papers = []
         if method == "exhausted":
-            papers = self.
+            papers = self.exhausted_lemma_search(query)
+        elif method == "exhausted_lemma":
+            if self.nlp is None:
+                self.nlp = spacy.load("en_core_web_sm")
+            papers = self.exhausted_lemma_search(query, lemmatization=True)
         else:
             raise NotImplementedError
 
@@ -83,8 +99,13 @@ class SearchAPI:
         papers = sorted(set(papers), key=lambda p: (p.year, p.month), reverse=True)
         return papers
 
-    def tokenize(self, string: str) -> list[str]:
+    def tokenize(self, string: str, lemmatization: bool = False) -> list[str]:
+        _string = string.lower()
+        if lemmatization:
+            doc = self.nlp(_string)
+            return [str(t.lemma_) for t in doc]
+        else:
+            return _string.split()
 
     @classmethod
     def build_paper_list(cls, *args, **kwargs):
```
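Two things worth noting in the new `exhausted_lemma_search`. First, the paper loop uses Python's `for`/`else`: the `else` branch runs only when the loop over `fields` completes without hitting `break`, i.e. when every requested field matched. A standalone illustration of the construct:

```python
for field_matched in [True, True, True]:
    if not field_matched:
        break  # one failed field rejects the paper
else:
    # no break fired: every field matched
    print("keep this paper")
```

Second, the spacy pipeline is loaded lazily: `search(query, method="exhausted_lemma")` triggers `spacy.load("en_core_web_sm")` on first use, so the plain `exhausted` method never pays the model-loading cost.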
src/interfaces/dblp.py
CHANGED
```diff
@@ -1,8 +1,8 @@
+import logging
 import pathlib
 import random
 import re
 import time
-import logging
 
 import requests
 from tqdm import trange
@@ -11,7 +11,6 @@ from src.engine import SearchAPI
 from src.interfaces import Paper
 from src.utils import dump_json, load_json
 
-
 logger = logging.getLogger("uvicorn.default")
 
 
```
tox.ini
CHANGED
```diff
@@ -4,4 +4,5 @@ ignore=
     E501
 
 exclude=
-    cache
+    cache/,
+    debug.py
```