Spico committed
Commit 7b40c73 · 1 Parent(s): ab44449

- refactor exhausted search API


- modify the `in_string` query matching to fully text-based matching
- support lemmatized searching (via spaCy; time-consuming)
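
In effect, the new `in_string` normalizes both the query statement and the target field into token strings and runs a plain substring test; with lemmatization on, both sides first pass through spaCy. A minimal standalone sketch of that behavior (mirroring the new `tokenize`/`in_string` from the `src/engine.py` diff below; the example titles are invented):

```python
import spacy

# requires: pip install spacy && python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

def tokenize(string: str, lemmatization: bool = False) -> list[str]:
    # lowercase always; lemmatize only on request, since spaCy is the slow part
    _string = string.lower()
    if lemmatization:
        return [t.lemma_ for t in nlp(_string)]
    return _string.split()

def in_string(statement: str, string: str, lemmatization: bool = False) -> bool:
    # fully text-based matching: join tokens back into strings, then substring test
    _stmt = " ".join(tokenize(statement, lemmatization=lemmatization))
    _string = " ".join(tokenize(string, lemmatization=lemmatization))
    return _stmt in _string

print(in_string("event extraction", "A Survey of Event Extraction"))  # True
print(in_string("event extraction", "Extracting Events from News"))   # False
print(in_string("extract event", "Extracting Events from News",
                lemmatization=True))  # True: "extracting events" -> "extract event"
```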

Files changed (9)
  1. .gitignore +1 -0
  2. README.md +7 -4
  3. requirements.txt +2 -1
  4. run.py +1 -1
  5. scripts/get_aclanthology.sh +1 -1
  6. server.py +2 -2
  7. src/engine.py +60 -39
  8. src/interfaces/dblp.py +1 -2
  9. tox.ini +2 -1
.gitignore CHANGED
@@ -131,3 +131,4 @@ dmypy.json
 cache/
 .coverage
 results/
+debug.py
README.md CHANGED
@@ -20,20 +20,23 @@ A toolkit to help search for papers from aclanthology, arXiv and dblp.
 ## 🌴 Setup
 
 1. Make sure you have [Git](https://git-scm.com/) and [Python](https://www.python.org/downloads/) 3.10.8 installed (or Python >= 3.9).
-2. Install dependencies: `pip install -r requirements.txt`
+2. Install dependencies: `pip install -r requirements.txt`, `python -m spacy download en_core_web_sm`
 
 ## 🚀 QuickStart
 
 Run the example in `run.py`:
 
 ```bash
-$ # clone this repo
+# clone this repo
 $ git clone https://github.com/Spico197/paper-hero.git
 $ cd paper-hero
-$ # get ready for the acl data, since it is cache-based
+# download and install dependencies
+$ pip install -r requirements.txt
+$ python -m spacy download en_core_web_sm
+# get ready for the acl data, since it is cache-based
 $ bash scripts/get_aclanthology.sh
 $ python run.py
-$ # the results will be saved into `results/`, check them out 🎉
+# the results will be saved into `results/`, check them out 🎉
 $ ls results
 ```
 
requirements.txt CHANGED
@@ -2,4 +2,5 @@ tqdm>=4.64.1
 requests>=2.28.1
 feedparser>=6.0.10
 fastapi>=0.88.0
-uvicorn>=0.20.0
+uvicorn>=0.20.0
+spacy>=3.5.0
run.py CHANGED
@@ -43,7 +43,7 @@ if __name__ == "__main__":
         "month": [
             # the same as the `year` field
             ["4", "11"],
-        ]
+        ],
     }
     ee_papers = acl_paper_list.search(ee_query)
     dump_paper_list_to_markdown_checklist(ee_papers, "results/ee-paper-list.md")
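
For context, a query maps each field to a list of lists with OR-of-AND semantics: a paper matches a field if every statement in at least one inner list matches (see `exhausted_lemma_search` in the `src/engine.py` diff below). A hypothetical fuller query; the `title` field name is an assumption, since only `year`/`month` appear in this diff:

```python
ee_query = {
    # outer list = OR, inner list = AND ("title" is an assumed field name)
    "title": [
        ["event extraction"],                 # phrase match, OR ...
        ["event", "argument", "extraction"],  # ... all three terms present
    ],
    # year/month spans are inclusive and validated as digit strings
    "year": [["2018", "2023"]],
    "month": [["4", "11"]],
}
```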
scripts/get_aclanthology.sh CHANGED
@@ -1,6 +1,6 @@
 set -ex
 
-mkdir cache
+mkdir -p cache
 cd cache
 if ! [ -f acl-anthology/bin/anthology/anthology.py ]; then
     git clone https://github.com/acl-org/acl-anthology
server.py CHANGED
@@ -1,8 +1,8 @@
 import logging
 import os
-import uuid
-import tempfile
 import pathlib
+import tempfile
+import uuid
 
 import uvicorn
 from fastapi import FastAPI
src/engine.py CHANGED
@@ -1,3 +1,8 @@
+from collections import defaultdict
+
+import spacy
+from tqdm import tqdm
+
 from src.interfaces import Paper
 
 
@@ -6,51 +11,58 @@ class SearchAPI:
 
     def __init__(self) -> None:
         self.papers: list[Paper] = []
+        self.nlp = None
+
+    def in_string(self, statement: str, string: str, lemmatization: bool = False):
+        _stmt = " ".join(self.tokenize(statement, lemmatization=lemmatization))
+        _string = " ".join(self.tokenize(string, lemmatization=lemmatization))
 
-    def exhausted_search(self, query: dict[str, tuple[tuple[str]]]) -> list[Paper]:
+        return _stmt in _string
+
+    def exhausted_lemma_search(
+        self, query: dict[str, tuple[tuple[str]]], lemmatization: bool = False
+    ) -> list[Paper]:
         """Exhausted search papers by matching query"""
-        def _in_string(statement, string):
-            stmt_in_string = False
-            if " " in statement and statement.lower() in string.lower():
-                stmt_in_string = True
-            else:
-                tokens = self.tokenize(string.lower())
-                if statement.lower() in tokens:
-                    stmt_in_string = True
-            return stmt_in_string
-
-        papers = self.papers
-        for field in self.SEARCH_PRIORITY:
-            if field in query:
-                req = query[field]
-                time_spans = []
-                if field in ["year", "month"]:
-                    for span in req:
+        results = []
+        fields = []
+        time_spans = defaultdict(list)
+        for f in self.SEARCH_PRIORITY:
+            if f in query:
+                fields.append(f)
+                if f in ["year", "month"]:
+                    for span in query[f]:
                         assert len(span) == 2
                         assert all(num.isdigit() for num in span)
-                        time_spans.append((int(span[0]), int(span[1])))
+                        time_spans[f].append((int(span[0]), int(span[1])))
 
-                paper_indices = []
-                for i, p in enumerate(papers):
-                    matched = False
-                    if time_spans:
-                        if any(s <= p[field] <= e for s, e in time_spans):
+        pbar = tqdm(self.papers)
+        found = 0
+        for p in pbar:
+            for f in fields:
+                matched = False
+                or_statements = query[f]
+
+                if f in time_spans:
+                    for s, e in time_spans[f]:
+                        if s <= p[f] <= e:
                             matched = True
-                    else:
-                        if any(
-                            all(
-                                _in_string(stmt, p[field])
-                                for stmt in and_statements
-                            )
-                            for and_statements in req
+                            break
+                else:
+                    for and_statements in or_statements:
+                        if all(
+                            self.in_string(stmt, p[f], lemmatization=lemmatization)
+                            for stmt in and_statements
                         ):
                             matched = True
-
-                    if matched:
-                        paper_indices.append(i)
-                papers = [papers[i] for i in paper_indices]
-
-        return papers
+                            break
+                if not matched:
+                    break
+            else:
+                results.append(p)
+                found += 1
+                pbar.set_postfix({"found": found})
+
+        return results
 
     def search(
         self, query: dict[str, tuple[tuple[str]]], method: str = "exhausted"
@@ -75,7 +87,11 @@ class SearchAPI:
         """
         papers = []
         if method == "exhausted":
-            papers = self.exhausted_search(query)
+            papers = self.exhausted_lemma_search(query)
+        elif method == "exhausted_lemma":
+            if self.nlp is None:
+                self.nlp = spacy.load("en_core_web_sm")
+            papers = self.exhausted_lemma_search(query, lemmatization=True)
         else:
             raise NotImplementedError
 
@@ -83,8 +99,13 @@
         papers = sorted(set(papers), key=lambda p: (p.year, p.month), reverse=True)
         return papers
 
-    def tokenize(self, string: str) -> list[str]:
-        return string.lower().split()
+    def tokenize(self, string: str, lemmatization: bool = False) -> list[str]:
+        _string = string.lower()
+        if lemmatization:
+            doc = self.nlp(_string)
+            return [str(t.lemma_) for t in doc]
+        else:
+            return _string.split()
 
     @classmethod
     def build_paper_list(cls, *args, **kwargs):
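
Both `search` branches now funnel into `exhausted_lemma_search`; `method="exhausted_lemma"` additionally lazy-loads the spaCy pipeline and lemmatizes both sides of every comparison. A usage sketch, assuming `acl_paper_list` and `ee_query` from `run.py`:

```python
# fast path: lowercased whole-text substring matching
papers = acl_paper_list.search(ee_query, method="exhausted")

# slow path: lemmatized matching, so "Extracting Events" can satisfy "extract event";
# en_core_web_sm is loaded once on first use and cached on self.nlp
papers = acl_paper_list.search(ee_query, method="exhausted_lemma")
```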
src/interfaces/dblp.py CHANGED
@@ -1,8 +1,8 @@
+import logging
 import pathlib
 import random
 import re
 import time
-import logging
 
 import requests
 from tqdm import trange
@@ -11,7 +11,6 @@ from src.engine import SearchAPI
 from src.interfaces import Paper
 from src.utils import dump_json, load_json
 
-
 logger = logging.getLogger("uvicorn.default")
 
 
tox.ini CHANGED
@@ -4,4 +4,5 @@ ignore=
     E501
 
 exclude=
-    cache/
+    cache/,
+    debug.py