Update app.py
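This commit deduplicates the BibTexWriter setup into a shared get_bibtex_writer() helper, tightens type hints (Dict[str, Any] and Tuple[...] via new Any/Tuple imports), adds docstrings throughout, guards against XML nodes whose .text is None in parse_entry, builds the ArXiv request URL explicitly before session.get, rewrites the author-name repair prompt, and hardens BibTeX key generation against whitespace-only titles.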
app.py
CHANGED
@@ -4,7 +4,7 @@ import urllib.parse
 import re
 import xml.etree.ElementTree as ET
 from dataclasses import dataclass, field
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Any, Tuple
 import sys
 from loguru import logger
 
@@ -12,13 +12,23 @@ import aiohttp
 import gradio as gr
 
 from langchain.prompts import PromptTemplate
-
 from langchain_google_genai import ChatGoogleGenerativeAI
 
 import bibtexparser
 from bibtexparser.bwriter import BibTexWriter
 from bibtexparser.bibdatabase import BibDatabase
 
+
+def get_bibtex_writer() -> BibTexWriter:
+    """
+    Create and return a configured BibTexWriter instance.
+    """
+    writer = BibTexWriter()
+    writer.indent = ' '
+    writer.comma_first = False
+    return writer
+
+
 @dataclass
 class Config:
     gemini_api_key: str
@@ -28,18 +38,25 @@ class Config:
     max_citations_per_query: int = 10
     arxiv_base_url: str = 'http://export.arxiv.org/api/query?'
    crossref_base_url: str = 'https://api.crossref.org/works'
-    default_headers: Dict = field(default_factory=lambda: {
+    default_headers: Dict[str, str] = field(default_factory=lambda: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    })
    log_level: str = 'DEBUG'
 
+
 class ArxivXmlParser:
+    """
+    Class to parse ArXiv XML responses.
+    """
    NS = {
        'atom': 'http://www.w3.org/2005/Atom',
        'arxiv': 'http://arxiv.org/schemas/atom'
    }
 
-    def parse_papers(self, data: str) -> List[Dict]:
+    def parse_papers(self, data: str) -> List[Dict[str, Any]]:
+        """
+        Parse ArXiv XML data and return a list of paper dictionaries.
+        """
        try:
            root = ET.fromstring(data)
            papers = []
@@ -52,12 +69,15 @@ class ArxivXmlParser:
            logger.error(f"Error parsing ArXiv XML: {e}")
            return []
 
-    def parse_entry(self, entry) -> Optional[Dict]:
+    def parse_entry(self, entry: ET.Element) -> Optional[Dict[str, Any]]:
+        """
+        Parse a single ArXiv entry element and return a dictionary with paper details.
+        """
        try:
            title_node = entry.find('atom:title', self.NS)
            if title_node is None:
                return None
-            title = title_node.text.strip()
+            title = title_node.text.strip() if title_node.text else ""
 
            authors = []
            for author in entry.findall('atom:author', self.NS):
@@ -66,15 +86,15 @@ class ArxivXmlParser:
                authors.append(self._format_author_name(author_name_node.text.strip()))
 
            arxiv_id_node = entry.find('atom:id', self.NS)
-            if arxiv_id_node is None:
+            if arxiv_id_node is None or not arxiv_id_node.text:
                return None
            arxiv_id = arxiv_id_node.text.split('/')[-1]
 
            published_node = entry.find('atom:published', self.NS)
-            year = published_node.text[:4] if published_node is not None else "Unknown"
+            year = published_node.text[:4] if (published_node is not None and published_node.text) else "Unknown"
 
            abstract_node = entry.find('atom:summary', self.NS)
-            abstract = abstract_node.text.strip() if abstract_node is not None else ""
+            abstract = abstract_node.text.strip() if (abstract_node is not None and abstract_node.text) else ""
 
            bibtex_key = f"{authors[0].split(',')[0]}{arxiv_id.replace('.', '')}" if authors else f"unknown{arxiv_id.replace('.', '')}"
            bibtex_entry = self._generate_bibtex_entry(bibtex_key, title, authors, arxiv_id, year)
@@ -94,12 +114,18 @@ class ArxivXmlParser:
 
    @staticmethod
    def _format_author_name(author: str) -> str:
+        """
+        Format an author name as 'Lastname, Firstname'.
+        """
        names = author.split()
        if len(names) > 1:
            return f"{names[-1]}, {' '.join(names[:-1])}"
        return author
 
    def _generate_bibtex_entry(self, key: str, title: str, authors: List[str], arxiv_id: str, year: str) -> str:
+        """
+        Generate a BibTeX entry for a paper.
+        """
        db = BibDatabase()
        db.entries = [{
            'ENTRYTYPE': 'article',
@@ -109,13 +135,15 @@ class ArxivXmlParser:
            'journal': f'arXiv preprint arXiv:{arxiv_id}',
            'year': year
        }]
-        writer = BibTexWriter()
-        writer.indent = ' '
-        writer.comma_first = False
+        writer = get_bibtex_writer()
        return writer.write(db).strip()
 
+
 class AsyncContextManager:
-    async def __aenter__(self):
+    """
+    Asynchronous context manager to handle aiohttp ClientSession.
+    """
+    async def __aenter__(self) -> aiohttp.ClientSession:
        self._session = aiohttp.ClientSession()
        return self._session
 
@@ -123,13 +151,17 @@ class AsyncContextManager:
        if self._session:
            await self._session.close()
 
+
 class CitationGenerator:
-    def __init__(self, config: Config):
+    """
+    Class that handles generating citations using AI and searching for academic papers.
+    """
+    def __init__(self, config: Config) -> None:
        self.config = config
        self.xml_parser = ArxivXmlParser()
        self.async_context = AsyncContextManager()
        self.llm = ChatGoogleGenerativeAI(
-            model="gemini-2.0-flash
+            model="gemini-2.0-flash",
            temperature=0.3,
            google_api_key=config.gemini_api_key,
            streaming=True
@@ -165,6 +197,9 @@ class CitationGenerator:
        logger.add(sys.stderr, level=config.log_level)
 
    async def generate_queries(self, text: str, num_queries: int) -> List[str]:
+        """
+        Generate a list of academic search queries from the input text.
+        """
        input_map = {
            "text": text,
            "num_queries": num_queries
@@ -186,14 +221,15 @@ class CitationGenerator:
                lines = [line.strip() for line in content.split('\n')
                         if line.strip() and not line.strip().startswith(('[', ']'))]
                return lines[:num_queries]
-
            return ["deep learning neural networks"]
-
        except Exception as e:
            logger.error(f"Error generating queries: {e}")
            return ["deep learning neural networks"]
 
-    async def search_arxiv(self, session: aiohttp.ClientSession, query: str, max_results: int) -> List[Dict]:
+    async def search_arxiv(self, session: aiohttp.ClientSession, query: str, max_results: int) -> List[Dict[str, Any]]:
+        """
+        Search ArXiv for papers matching the query.
+        """
        try:
            params = {
                'search_query': f'all:{urllib.parse.quote(query)}',
@@ -202,8 +238,9 @@ class CitationGenerator:
                'sortBy': 'relevance',
                'sortOrder': 'descending'
            }
+            url = self.config.arxiv_base_url + urllib.parse.urlencode(params)
            async with session.get(
-                self.config.arxiv_base_url + urllib.parse.urlencode(params),
+                url,
                headers=self.config.default_headers,
                timeout=30
            ) as response:
@@ -215,20 +252,23 @@ class CitationGenerator:
            return []
 
    async def fix_author_name(self, author: str) -> str:
+        """
+        Correct an author name that contains corrupted characters.
+        """
        if not re.search(r'[�]', author):
            return author
        try:
            prompt = f"""Fix this author name that contains corrupted characters (�):
 
-
+Name: {author}
 
-
-
-
-
-
-
-
+Requirements:
+1. Return ONLY the fixed author name
+2. Use proper diacritical marks for names
+3. Consider common name patterns and languages
+4. If unsure, use the most likely letter
+5. Maintain the format: "Lastname, Firstname"
+"""
            response = await self.llm.ainvoke(prompt)
            fixed_name = response.content.strip()
            return fixed_name if fixed_name else author
@@ -237,6 +277,9 @@ class CitationGenerator:
            return author
 
    async def format_bibtex_author_names(self, text: str) -> str:
+        """
+        Clean and format author names in a BibTeX string.
+        """
        try:
            bib_database = bibtexparser.loads(text)
            for entry in bib_database.entries:
@@ -247,15 +290,16 @@ class CitationGenerator:
                    fixed_author = await self.fix_author_name(author)
                    cleaned_authors.append(fixed_author)
                entry['author'] = ' and '.join(cleaned_authors)
-            writer = BibTexWriter()
-            writer.indent = ' '
-            writer.comma_first = False
+            writer = get_bibtex_writer()
            return writer.write(bib_database).strip()
        except Exception as e:
            logger.error(f"Error cleaning BibTeX special characters: {e}")
            return text
 
-    async def search_crossref(self, session: aiohttp.ClientSession, query: str, max_results: int) -> List[Dict]:
+    async def search_crossref(self, session: aiohttp.ClientSession, query: str, max_results: int) -> List[Dict[str, Any]]:
+        """
+        Search CrossRef for papers matching the query.
+        """
        try:
            cleaned_query = query.replace("'", "").replace('"', "")
            if ' ' in cleaned_query:
@@ -316,7 +360,6 @@ class CitationGenerator:
                        continue
 
                    bibtex_text = await bibtex_response.text()
-
                    bib_database = bibtexparser.loads(bibtex_text)
                    if not bib_database.entries:
                        continue
@@ -335,9 +378,7 @@ class CitationGenerator:
                        entry['ID'] = key
                        existing_keys.add(key)
 
-                    writer = BibTexWriter()
-                    writer.indent = ' '
-                    writer.comma_first = False
+                    writer = get_bibtex_writer()
                    formatted_bibtex = writer.write(bib_database).strip()
 
                    papers.append({
@@ -364,7 +405,10 @@ class CitationGenerator:
            logger.error(f"Error searching CrossRef: {e}")
            return []
 
-    def _generate_unique_bibtex_key(self, entry: Dict, existing_keys: set) -> str:
+    def _generate_unique_bibtex_key(self, entry: Dict[str, Any], existing_keys: set) -> str:
+        """
+        Generate a unique BibTeX key for an entry.
+        """
        entry_type = entry.get('ENTRYTYPE', '').lower()
        author_field = entry.get('author', '')
        year = entry.get('year', '')
@@ -373,10 +417,10 @@ class CitationGenerator:
 
        if entry_type == 'inbook':
            booktitle = entry.get('booktitle', '')
-            title_word = re.sub(r'\W+', '', booktitle.split()[0]) if booktitle else 'untitled'
+            title_word = re.sub(r'\W+', '', booktitle.split()[0]) if booktitle.split() else 'untitled'
        else:
            title = entry.get('title', '')
-            title_word = re.sub(r'\W+', '', title.split()[0]) if title else 'untitled'
+            title_word = re.sub(r'\W+', '', title.split()[0]) if title.split() else 'untitled'
 
        base_key = f"{first_author_last_name}{year}{title_word}"
        key = base_key
@@ -387,17 +431,20 @@ class CitationGenerator:
        return key
 
    async def process_text(self, text: str, num_queries: int, citations_per_query: int,
-                           use_arxiv: bool = True, use_crossref: bool = True) ->
+                           use_arxiv: bool = True, use_crossref: bool = True) -> Tuple[str, str, str]:
+        """
+        Process the input text to generate citations and corresponding BibTeX entries.
+        """
        if not (use_arxiv or use_crossref):
            return "Please select at least one source (ArXiv or CrossRef)", "", ""
 
        num_queries = min(max(1, num_queries), self.config.max_queries)
        citations_per_query = min(max(1, citations_per_query), self.config.max_citations_per_query)
 
-        async def generate_queries_tool(input_data:
+        async def generate_queries_tool(input_data: Dict[str, Any]) -> List[str]:
            return await self.generate_queries(input_data["text"], input_data["num_queries"])
 
-        async def search_papers_tool(input_data:
+        async def search_papers_tool(input_data: Dict[str, Any]) -> List[Dict[str, Any]]:
            queries = input_data["queries"]
            papers = []
            async with self.async_context as session:
@@ -411,7 +458,7 @@ class CitationGenerator:
                for r in results:
                    if not isinstance(r, Exception):
                        papers.extend(r)
-            #
+            # Remove duplicate papers
            unique_papers = []
            seen_keys = set()
            for p in papers:
@@ -420,7 +467,7 @@ class CitationGenerator:
                    unique_papers.append(p)
            return unique_papers
 
-        async def cite_text_tool(input_data:
+        async def cite_text_tool(input_data: Dict[str, Any]) -> Tuple[str, str]:
            try:
                citation_input = {
                    "text": input_data["text"],
@@ -430,7 +477,6 @@ class CitationGenerator:
                response = await self.llm.ainvoke(prompt)
                cited_text = response.content.strip()
 
-                # Aggregate BibTeX entries
                bib_database = BibDatabase()
                for p in input_data["papers"]:
                    if 'bibtex_entry' in p:
@@ -439,16 +485,14 @@ class CitationGenerator:
                            bib_database.entries.append(bib_db.entries[0])
                        else:
                            logger.warning(f"Empty BibTeX entry for key: {p['bibtex_key']}")
-                writer = BibTexWriter()
-                writer.indent = ' '
-                writer.comma_first = False
+                writer = get_bibtex_writer()
                bibtex_entries = writer.write(bib_database).strip()
                return cited_text, bibtex_entries
            except Exception as e:
                logger.error(f"Error inserting citations: {e}")
                return input_data["text"], ""
 
-        async def agent_run(input_data:
+        async def agent_run(input_data: Dict[str, Any]) -> Tuple[str, str, str]:
            queries = await generate_queries_tool(input_data)
            papers = await search_papers_tool({
                "queries": queries,
@@ -473,9 +517,13 @@ class CitationGenerator:
            })
            return final_text, final_bibtex, final_queries
 
+
 def create_gradio_interface() -> gr.Interface:
+    """
+    Create and return a Gradio interface for the citation generator.
+    """
    async def process(api_key: str, text: str, num_queries: int, citations_per_query: int,
-                      use_arxiv: bool, use_crossref: bool):
+                      use_arxiv: bool, use_crossref: bool) -> Tuple[str, str, str]:
        if not api_key.strip():
            return "Please enter your Gemini API Key.", "", ""
        if not text.strip():
@@ -494,14 +542,14 @@ def create_gradio_interface() -> gr.Interface:
 
    css = """
    :root {
-        /* Modern
+        /* Modern color palette */
        --primary-bg: #F8F9FA;
        --secondary-bg: #FFFFFF;
-        --accent-1: #4A90E2;
-        --accent-2: #50C878;
-        --accent-3: #F5B041;
-        --text-primary: #2C3E50;
-        --text-secondary: #566573;
+        --accent-1: #4A90E2;
+        --accent-2: #50C878;
+        --accent-3: #F5B041;
+        --text-primary: #2C3E50;
+        --text-secondary: #566573;
        --border: #E5E7E9;
        --shadow: rgba(0, 0, 0, 0.1);
    }
@@ -690,6 +738,7 @@ def create_gradio_interface() -> gr.Interface:
 
    return demo
 
+
 if __name__ == "__main__":
    demo = create_gradio_interface()
    try:
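
Note: the diff collapses four identical BibTexWriter configuration blocks into the new get_bibtex_writer() helper. A minimal standalone sketch of the pattern (the sample entry below is a hypothetical placeholder; bibtexparser 1.x is assumed):

from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase

def get_bibtex_writer() -> BibTexWriter:
    # One shared writer configuration instead of four inline copies.
    writer = BibTexWriter()
    writer.indent = ' '          # indentation for fields within an entry
    writer.comma_first = False   # commas at the end of each field line
    return writer

# Hypothetical entry, only to show the round trip through the writer.
db = BibDatabase()
db.entries = [{
    'ENTRYTYPE': 'article',
    'ID': 'Doe2024example',
    'author': 'Doe, Jane',
    'title': 'An Example Title',
    'journal': 'arXiv preprint arXiv:2401.00001',
    'year': '2024',
}]
print(get_bibtex_writer().write(db).strip())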
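
The added guards in parse_entry exist because xml.etree.ElementTree sets .text to None for empty elements, so the old title_node.text.strip() raised AttributeError on entries like <title/>. A small illustration of the failure mode and the guarded form:

import xml.etree.ElementTree as ET

entry = ET.fromstring('<entry><title/><published>2024-06-01</published></entry>')

title_node = entry.find('title')
# title_node is a real Element, but its .text is None for the empty <title/>,
# so the old `title_node.text.strip()` would raise AttributeError here.
title = title_node.text.strip() if title_node.text else ""

published_node = entry.find('published')
year = published_node.text[:4] if (published_node is not None and published_node.text) else "Unknown"

print(repr(title), year)  # '' 2024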
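
Likewise, the key-generation change from "if title" to "if title.split()" closes an edge case: a whitespace-only string is truthy, so the old guard passed and title.split()[0] raised IndexError. For example:

import re

title = '   '  # whitespace-only: truthy, but split() yields an empty list
# Old guard: `if title` passes and title.split()[0] raises IndexError.
title_word = re.sub(r'\W+', '', title.split()[0]) if title.split() else 'untitled'
print(title_word)  # untitled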