Spico committed
Commit 7b40c73 · 1 Parent(s): ab44449

- refactor exhausted search API


- modify the `in_string` query matching to fully text-based matching
- support lemmatized searching (via spaCy; time-consuming)
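
In effect, the new `in_string` normalizes both the query statement and the target field into token strings and runs a plain substring test; with lemmatization on, both sides first pass through spaCy. A minimal standalone sketch of that behavior (mirroring the new `tokenize`/`in_string` from the `src/engine.py` diff below; the example titles are invented):

```python
import spacy

# requires: pip install spacy && python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

def tokenize(string: str, lemmatization: bool = False) -> list[str]:
    # lowercase always; lemmatize only on request, since spaCy is the slow part
    _string = string.lower()
    if lemmatization:
        return [t.lemma_ for t in nlp(_string)]
    return _string.split()

def in_string(statement: str, string: str, lemmatization: bool = False) -> bool:
    # fully text-based matching: join tokens back into strings, then substring test
    _stmt = " ".join(tokenize(statement, lemmatization=lemmatization))
    _string = " ".join(tokenize(string, lemmatization=lemmatization))
    return _stmt in _string

print(in_string("event extraction", "A Survey of Event Extraction"))  # True
print(in_string("event extraction", "Extracting Events from News"))   # False
print(in_string("extract event", "Extracting Events from News",
                lemmatization=True))  # True: "extracting events" -> "extract event"
```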

Files changed (9)
  1. .gitignore +1 -0
  2. README.md +7 -4
  3. requirements.txt +2 -1
  4. run.py +1 -1
  5. scripts/get_aclanthology.sh +1 -1
  6. server.py +2 -2
  7. src/engine.py +60 -39
  8. src/interfaces/dblp.py +1 -2
  9. tox.ini +2 -1
.gitignore CHANGED
@@ -131,3 +131,4 @@ dmypy.json
 cache/
 .coverage
 results/
+debug.py
README.md CHANGED
@@ -20,20 +20,23 @@ A toolkit to help search for papers from aclanthology, arXiv and dblp.
 ## 🌴 Setup
 
 1. Make sure you have [Git](https://git-scm.com/) and [Python](https://www.python.org/downloads/) 3.10.8 installed (or Python >= 3.9).
-2. Install dependencies: `pip install -r requirements.txt`
+2. Install dependencies: `pip install -r requirements.txt`, `python -m spacy download en_core_web_sm`
 
 ## 🚀 QuickStart
 
 Run the example in `run.py`:
 
 ```bash
-$ # clone this repo
+# clone this repo
 $ git clone https://github.com/Spico197/paper-hero.git
 $ cd paper-hero
-$ # get ready for the acl data, since it is cache-based
+# download and install dependencies
+$ pip install -r requirements.txt
+$ python -m spacy download en_core_web_sm
+# get ready for the acl data, since it is cache-based
 $ bash scripts/get_aclanthology.sh
 $ python run.py
-$ # the results will be saved into `results/`, check them out 🎉
+# the results will be saved into `results/`, check them out 🎉
 $ ls results
 ```
 
requirements.txt CHANGED
@@ -2,4 +2,5 @@ tqdm>=4.64.1
 requests>=2.28.1
 feedparser>=6.0.10
 fastapi>=0.88.0
-uvicorn>=0.20.0
+uvicorn>=0.20.0
+spacy>=3.5.0
run.py CHANGED
@@ -43,7 +43,7 @@ if __name__ == "__main__":
         "month": [
             # the same as the `year` field
             ["4", "11"],
-        ]
+        ],
     }
     ee_papers = acl_paper_list.search(ee_query)
     dump_paper_list_to_markdown_checklist(ee_papers, "results/ee-paper-list.md")
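
For context, a query maps each field to a list of lists with OR-of-AND semantics: a paper matches a field if every statement in at least one inner list matches (see `exhausted_lemma_search` in the `src/engine.py` diff below). A hypothetical fuller query; the `title` field name is an assumption, since only `year`/`month` appear in this diff:

```python
ee_query = {
    # outer list = OR, inner list = AND ("title" is an assumed field name)
    "title": [
        ["event extraction"],                 # phrase match, OR ...
        ["event", "argument", "extraction"],  # ... all three terms present
    ],
    # year/month spans are inclusive and validated as digit strings
    "year": [["2018", "2023"]],
    "month": [["4", "11"]],
}
```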
scripts/get_aclanthology.sh CHANGED
@@ -1,6 +1,6 @@
 set -ex
 
-mkdir cache
+mkdir -p cache
 cd cache
 if ! [ -f acl-anthology/bin/anthology/anthology.py ]; then
     git clone https://github.com/acl-org/acl-anthology
server.py CHANGED
@@ -1,8 +1,8 @@
 import logging
 import os
-import uuid
-import tempfile
 import pathlib
+import tempfile
+import uuid
 
 import uvicorn
 from fastapi import FastAPI
src/engine.py CHANGED
@@ -1,3 +1,8 @@
+from collections import defaultdict
+
+import spacy
+from tqdm import tqdm
+
 from src.interfaces import Paper
 
 
@@ -6,51 +11,58 @@ class SearchAPI:
 
     def __init__(self) -> None:
         self.papers: list[Paper] = []
+        self.nlp = None
+
+    def in_string(self, statement: str, string: str, lemmatization: bool = False):
+        _stmt = " ".join(self.tokenize(statement, lemmatization=lemmatization))
+        _string = " ".join(self.tokenize(string, lemmatization=lemmatization))
 
-    def exhausted_search(self, query: dict[str, tuple[tuple[str]]]) -> list[Paper]:
+        return _stmt in _string
+
+    def exhausted_lemma_search(
+        self, query: dict[str, tuple[tuple[str]]], lemmatization: bool = False
+    ) -> list[Paper]:
         """Exhausted search papers by matching query"""
-        def _in_string(statement, string):
-            stmt_in_string = False
-            if " " in statement and statement.lower() in string.lower():
-                stmt_in_string = True
-            else:
-                tokens = self.tokenize(string.lower())
-                if statement.lower() in tokens:
-                    stmt_in_string = True
-            return stmt_in_string
-
-        papers = self.papers
-        for field in self.SEARCH_PRIORITY:
-            if field in query:
-                req = query[field]
-                time_spans = []
-                if field in ["year", "month"]:
-                    for span in req:
+        results = []
+        fields = []
+        time_spans = defaultdict(list)
+        for f in self.SEARCH_PRIORITY:
+            if f in query:
+                fields.append(f)
+                if f in ["year", "month"]:
+                    for span in query[f]:
                         assert len(span) == 2
                         assert all(num.isdigit() for num in span)
-                        time_spans.append((int(span[0]), int(span[1])))
+                        time_spans[f].append((int(span[0]), int(span[1])))
 
-                paper_indices = []
-                for i, p in enumerate(papers):
-                    matched = False
-                    if time_spans:
-                        if any(s <= p[field] <= e for s, e in time_spans):
+        pbar = tqdm(self.papers)
+        found = 0
+        for p in pbar:
+            for f in fields:
+                matched = False
+                or_statements = query[f]
+
+                if f in time_spans:
+                    for s, e in time_spans[f]:
+                        if s <= p[f] <= e:
                             matched = True
-                    else:
-                        if any(
-                            all(
-                                _in_string(stmt, p[field])
-                                for stmt in and_statements
-                            )
-                            for and_statements in req
+                            break
+                else:
+                    for and_statements in or_statements:
+                        if all(
+                            self.in_string(stmt, p[f], lemmatization=lemmatization)
+                            for stmt in and_statements
                         ):
                             matched = True
-
-                    if matched:
-                        paper_indices.append(i)
-                papers = [papers[i] for i in paper_indices]
-
-        return papers
+                            break
+                if not matched:
+                    break
+            else:
+                results.append(p)
+                found += 1
+                pbar.set_postfix({"found": found})
+
+        return results
 
     def search(
         self, query: dict[str, tuple[tuple[str]]], method: str = "exhausted"
@@ -75,7 +87,11 @@ class SearchAPI:
         """
         papers = []
         if method == "exhausted":
-            papers = self.exhausted_search(query)
+            papers = self.exhausted_lemma_search(query)
+        elif method == "exhausted_lemma":
+            if self.nlp is None:
+                self.nlp = spacy.load("en_core_web_sm")
+            papers = self.exhausted_lemma_search(query, lemmatization=True)
         else:
             raise NotImplementedError
 
@@ -83,8 +99,13 @@
         papers = sorted(set(papers), key=lambda p: (p.year, p.month), reverse=True)
         return papers
 
-    def tokenize(self, string: str) -> list[str]:
-        return string.lower().split()
+    def tokenize(self, string: str, lemmatization: bool = False) -> list[str]:
+        _string = string.lower()
+        if lemmatization:
+            doc = self.nlp(_string)
+            return [str(t.lemma_) for t in doc]
+        else:
+            return _string.split()
 
     @classmethod
     def build_paper_list(cls, *args, **kwargs):
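
Both `search` branches now funnel into `exhausted_lemma_search`; `method="exhausted_lemma"` additionally lazy-loads the spaCy pipeline and lemmatizes both sides of every comparison. A usage sketch, assuming `acl_paper_list` and `ee_query` from `run.py`:

```python
# fast path: lowercased whole-text substring matching
papers = acl_paper_list.search(ee_query, method="exhausted")

# slow path: lemmatized matching, so "Extracting Events" can satisfy "extract event";
# en_core_web_sm is loaded once on first use and cached on self.nlp
papers = acl_paper_list.search(ee_query, method="exhausted_lemma")
```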
src/interfaces/dblp.py CHANGED
@@ -1,8 +1,8 @@
+import logging
 import pathlib
 import random
 import re
 import time
-import logging
 
 import requests
 from tqdm import trange
@@ -11,7 +11,6 @@ from src.engine import SearchAPI
 from src.interfaces import Paper
 from src.utils import dump_json, load_json
 
-
 logger = logging.getLogger("uvicorn.default")
 
 
tox.ini CHANGED
@@ -4,4 +4,5 @@ ignore=
     E501
 
 exclude=
-    cache/
+    cache/,
+    debug.py