Spaces:

kenton-li
/

ChatArxiv

Runtime error

App Files Files Community

kenton-li committed on Apr 28, 2023

Commit

13e969c

1 Parent(s): 1a8b486

Upload 6 files

Browse files

Files changed (6) hide show

assets/blank.pdf +0 -0
assets/pic.png +0 -0
src/optimizeOpenAI.py +233 -0
src/paper.py +121 -0
src/reader.py +109 -0
src/utils.py +5 -0

assets/blank.pdf ADDED Viewed

Binary file (41.2 kB). View file

assets/pic.png ADDED Viewed

src/optimizeOpenAI.py ADDED Viewed

	@@ -0,0 +1,233 @@

+"""
+A simple wrapper for the official ChatGPT API
+"""
+import json
+import os
+import threading
+import time
+import requests
+import tiktoken
+from typing import Generator
+from queue import PriorityQueue as PQ
+import json
+import os
+import time
+class chatPaper:
+    """
+    Official ChatGPT API
+    """
+    def __init__(
+        self,
+        api_keys: list,
+        proxy = None,
+        api_proxy = None,
+        max_tokens: int = 4000,
+        temperature: float = 0.5,
+        top_p: float = 1.0,
+        model_name: str = "gpt-3.5-turbo",
+        reply_count: int = 1,
+        system_prompt = "You are ChatArxiv, A paper reading bot",
+        lastAPICallTime = time.time()-100,
+        apiTimeInterval = 20,
+    ) -> None:
+        self.model_name = model_name
+        self.system_prompt = system_prompt
+        self.apiTimeInterval = apiTimeInterval
+        self.session = requests.Session()
+        self.api_keys = PQ()
+        for key in api_keys:
+            self.api_keys.put((lastAPICallTime,key))
+        self.proxy = proxy
+        if self.proxy:
+            proxies = {
+                "http": self.proxy,
+                "https": self.proxy,
+            }
+            self.session.proxies = proxies
+        self.max_tokens = max_tokens
+        self.temperature = temperature
+        self.top_p = top_p
+        self.reply_count = reply_count
+        self.decrease_step = 250
+        self.conversation = {}
+        self.ENCODER = tiktoken.get_encoding("gpt2")
+        if self.token_str(self.system_prompt) > self.max_tokens:
+            raise Exception("System prompt is too long")
+        self.lock = threading.Lock()
+    def get_api_key(self):
+        with self.lock:
+            apiKey = self.api_keys.get()
+            delay = self._calculate_delay(apiKey)
+            time.sleep(delay)
+            self.api_keys.put((time.time(), apiKey[1]))
+            return apiKey[1]
+    def _calculate_delay(self, apiKey):
+        elapsed_time = time.time() - apiKey[0]
+        if elapsed_time < self.apiTimeInterval:
+            return self.apiTimeInterval - elapsed_time
+        else:
+            return 0
+    def add_to_conversation(self, message: str, role: str, convo_id: str = "default"):
+        if(convo_id not in self.conversation):
+            self.reset(convo_id)
+        self.conversation[convo_id].append({"role": role, "content": message})
+    def __truncate_conversation(self, convo_id: str = "default"):
+        """
+        Truncate the conversation
+        """
+        last_dialog = self.conversation[convo_id][-1]
+        query = str(last_dialog['content'])
+        if(len(self.ENCODER.encode(str(query)))>self.max_tokens):
+            query = query[:int(1.5*self.max_tokens)]
+        while(len(self.ENCODER.encode(str(query)))>self.max_tokens):
+            query = query[:self.decrease_step]
+        self.conversation[convo_id] = self.conversation[convo_id][:-1]
+        full_conversation = "\n".join([str(x["content"]) for x in self.conversation[convo_id]],)
+        if len(self.ENCODER.encode(full_conversation)) > self.max_tokens:
+            self.conversation_summary(convo_id=convo_id)
+        full_conversation = ""
+        for x in self.conversation[convo_id]:
+            full_conversation = str(x["content"]) + "\n" + full_conversation
+        while True:
+            if (len(self.ENCODER.encode(full_conversation+query)) > self.max_tokens):
+                query = query[:self.decrease_step]
+            else:
+                break
+        last_dialog['content'] = str(query)
+        self.conversation[convo_id].append(last_dialog)
+    def ask_stream(
+        self,
+        prompt: str,
+        role: str = "user",
+        convo_id: str = "default",
+        **kwargs,
+    ) -> Generator:
+        if convo_id not in self.conversation.keys():
+            self.reset(convo_id=convo_id)
+        self.add_to_conversation(prompt, "user", convo_id=convo_id)
+        self.__truncate_conversation(convo_id=convo_id)
+        apiKey = self.get_api_key()
+        response = self.session.post(
+            "https://api.openai.com/v1/chat/completions",
+            headers={"Authorization": f"Bearer {kwargs.get('api_key', apiKey)}"},
+            json={
+                "model": self.model_name,
+                "messages": self.conversation[convo_id],
+                "stream": True,
+                # kwargs
+                "temperature": kwargs.get("temperature", self.temperature),
+                "top_p": kwargs.get("top_p", self.top_p),
+                "n": kwargs.get("n", self.reply_count),
+                "user": role,
+            },
+            stream=True,
+        )
+        if response.status_code != 200:
+            raise Exception(
+                f"Error: {response.status_code} {response.reason} {response.text}",
+            )
+        for line in response.iter_lines():
+            if not line:
+                continue
+            # Remove "data: "
+            line = line.decode("utf-8")[6:]
+            if line == "[DONE]":
+                break
+            resp: dict = json.loads(line)
+            choices = resp.get("choices")
+            if not choices:
+                continue
+            delta = choices[0].get("delta")
+            if not delta:
+                continue
+            if "content" in delta:
+                content = delta["content"]
+                yield content
+    def ask(self, prompt: str, role: str = "user", convo_id: str = "default", **kwargs):
+        """
+        Non-streaming ask
+        """
+        response = self.ask_stream(
+            prompt=prompt,
+            role=role,
+            convo_id=convo_id,
+            **kwargs,
+        )
+        full_response: str = "".join(response)
+        self.add_to_conversation(full_response, role, convo_id=convo_id)
+        usage_token = self.token_str(prompt)
+        com_token = self.token_str(full_response)
+        total_token = self.token_cost(convo_id=convo_id)
+        return full_response, usage_token, com_token, total_token
+    def check_api_available(self):
+        response = self.session.post(
+            "https://api.openai.com/v1/chat/completions",
+            headers={"Authorization": f"Bearer {self.get_api_key()}"},
+            json={
+                "model": self.engine,
+                "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "print A"}],
+                "stream": True,
+                # kwargs
+                "temperature": self.temperature,
+                "top_p": self.top_p,
+                "n": self.reply_count,
+                "user": "user",
+            },
+            stream=True,
+        )
+        if response.status_code == 200:
+            return True
+        else:
+            return False
+    def reset(self, convo_id: str = "default", system_prompt = None):
+        """
+        Reset the conversation
+        """
+        self.conversation[convo_id] = [
+            {"role": "system", "content": str(system_prompt or self.system_prompt)},
+        ]
+    def conversation_summary(self, convo_id: str = "default"):
+        input = ""
+        role = ""
+        for conv in self.conversation[convo_id]:
+            if (conv["role"]=='user'):
+                role = 'User'
+            else:
+                role = 'ChatGpt'
+            input+=role+' : '+conv['content']+'\n'
+        prompt = "Your goal is to summarize the provided conversation. Your summary should be concise and focus on the key information to facilitate better dialogue for the large language model.Ensure that you include all necessary details and relevant information while still reducing the length of the conversation as much as possible. Your summary should be clear and easily understandable for the ChatGpt model providing a comprehensive and concise summary of the conversation."
+        if(self.token_str(str(input)+prompt)>self.max_tokens):
+            input = input[self.token_str(str(input))-self.max_tokens:]
+        while self.token_str(str(input)+prompt)>self.max_tokens:
+            input = input[self.decrease_step:]
+        prompt = prompt.replace("{conversation}", input)
+        self.reset(convo_id='conversationSummary')
+        response = self.ask(prompt, convo_id='conversationSummary')
+        while self.token_str(str(response))>self.max_tokens:
+            response = response[:-self.decrease_step]
+        self.reset(convo_id='conversationSummary',system_prompt='Summariaze')
+        self.conversation[convo_id] = [
+            {"role": "system", "content": self.system_prompt},
+            {"role": "user", "content": "Summariaze"},
+            {"role": 'assistant', "content": response},
+        ]
+        return self.conversation[convo_id]
+    def token_cost(self,convo_id: str = "default"):
+        return len(self.ENCODER.encode("\n".join([x["content"] for x in self.conversation[convo_id]])))
+    def token_str(self, content:str):
+        return len(self.ENCODER.encode(content))
+def main():
+    return

src/paper.py ADDED Viewed

	@@ -0,0 +1,121 @@

+import fitz
+import os
+import io
+import arxiv
+import tempfile
+from PIL import Image
+from urllib.parse import urlparse
+class Paper:
+    def __init__(self, url=''):
+        self.url =  url
+        self.parse_url()
+        self.get_pdf()
+        self.paper_instance = {
+            'title': self.paper_arxiv.title,
+            'authors': self.paper_arxiv.authors,
+            'arxiv_id': self.paper_id,
+            'abstract': self.paper_arxiv.summary,
+            'pdf_url': self.paper_arxiv.pdf_url,
+            'categories': self.paper_arxiv.categories,
+            'published': self.paper_arxiv.published,
+            'updated': self.paper_arxiv.updated,
+            'content': {}
+        }
+        self.parse_pdf()
+    def get_paper(self):
+        return self.paper_instance
+    def parse_url(self):
+        self.url = self.url.replace('.pdf', '')
+        parsed_url = urlparse(self.url)
+        paper_id = os.path.basename(parsed_url.path)
+        self.paper_id = paper_id
+    def get_pdf(self):
+        search = arxiv.Search(id_list=[self.paper_id], max_results=1)
+        results = search.results()
+        paper_arxiv = next(results)
+        if paper_arxiv:
+            # with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
+            paper_path = f'{self.paper_id}.pdf'
+            dir_path = "./pdf"
+            os.makedirs(dir_path, exist_ok=True)
+            save_dir = os.path.join(dir_path, paper_path)
+            if not os.path.exists(save_dir):
+                paper_arxiv.download_pdf(dirpath=dir_path, filename=paper_path)
+            self.paper_arxiv = paper_arxiv
+            self.path = save_dir
+        else:
+            raise Exception("无法找到论文，请检查 URL 是否正确。")
+    def parse_pdf(self):
+        self.pdf = fitz.open(self.path)
+        self.text_list = [page.get_text() for page in self.pdf]
+        self.all_text = ' '.join(self.text_list)
+        self._parse_paper()
+        self.pdf.close()
+    def _get_sections(self):
+        sections = 'Abstract,Introduction,Related Work,Background,Preliminary,Problem Formulation,Methods,Methodology,Method,Approach,Approaches,Materials and Methods,Experiment Settings,Experiment,Experimental Results,Evaluation,Experiments,Results,Findings,Data Analysis,Discussion,Results and Discussion,Conclusion,References'
+        self.sections = sections.split(',')
+    def _get_all_page_index(self):
+        section_list = self.sections
+        section_page_dict = {}
+        for page_index, page in enumerate(self.pdf):
+            cur_text = page.get_text()
+            for section_name in section_list:
+                section_name_upper = section_name.upper()
+                if "Abstract" == section_name and section_name in cur_text:
+                    section_page_dict[section_name] = page_index
+                    continue
+                if section_name + '\n' in cur_text:
+                    section_page_dict[section_name] = page_index
+                elif section_name_upper + '\n' in cur_text:
+                    section_page_dict[section_name] = page_index
+        self.section_page_dict = section_page_dict
+    def _parse_paper(self):
+        """
+        Return: dict { <Section Name>: <Content> }
+        """
+        self._get_sections()
+        self._get_all_page_index()
+        text_list = [page.get_text() for page in self.pdf]
+        section_keys = list(self.section_page_dict.keys())
+        section_count = len(section_keys)
+        section_dict = {}
+        for sec_index, sec_name in enumerate(section_keys):
+            if sec_index == 0:
+                continue
+            start_page = self.section_page_dict[sec_name]
+            end_page = self.section_page_dict[section_keys[sec_index + 1]] if sec_index < section_count - 1 else len(text_list)
+            cur_sec_text = []
+            for page_i in range(start_page, end_page):
+                page_text = text_list[page_i]
+                if page_i == start_page:
+                    start_i = page_text.find(sec_name) if sec_name in page_text else page_text.find(sec_name.upper())
+                    page_text = page_text[start_i:]
+                if page_i == end_page - 1 and sec_index < section_count - 1:
+                    next_sec = section_keys[sec_index + 1]
+                    end_i = page_text.find(next_sec) if next_sec in page_text else page_text.find(next_sec.upper())
+                    page_text = page_text[:end_i]
+                cur_sec_text.append(page_text)
+            section_dict[sec_name] = ''.join(cur_sec_text).replace('-\n', '').replace('\n', ' ')
+        self.paper_instance['content'] = section_dict

src/reader.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import os
+import re
+import numpy as np
+import tenacity
+import arxiv
+import markdown
+from .paper import Paper
+from .optimizeOpenAI import chatPaper
+class Reader:
+    def __init__(self,
+                 paper: Paper,
+                 api_key='',
+                 user_name='defualt',
+                 language='English'):
+        self.user_name = user_name
+        self.language = language
+        self.paper_instance = paper.get_paper()
+        self.chat_api_list = [api_key]
+        self.chatPaper = chatPaper(api_keys=self.chat_api_list, apiTimeInterval=10)
+        self.chatPaper.add_to_conversation(message="You are a professional academic paper reviewer and mentor named Arxiv Bot. As a professional academic paper reviewer and helpful mentor, you possess exceptional logical and critical thinking skills, enabling you to provide concise and insightful responses.", role='assistant', convo_id="chat")
+        self.chatPaper.add_to_conversation(message="You are not allowed to discuss anything about politics, do not comment on anything about that.", role='assistant', convo_id="chat")
+        self.chatPaper.add_to_conversation(message="You will be asked to answer questions about the paper with deep knowledge about it, providing clear and concise explanations in a helpful, friendly manner, using the asker's language.", role='user', convo_id="chat")
+        # Read Basic Info of the Paper
+        self._read_basic()
+    def _get_intro_prompt(self, intro_content: str = ''):
+        if intro_content == '':
+            intro_key = [k for k in self.paper_instance['content'].keys()][0]
+            intro_content = self.paper_instance['content'][intro_key]
+        prompt = (f"This is an academic paper from {self.paper_instance['categories']} fields,\n\
+                        Title of this paper are {self.paper_instance['title']}.\n\
+                        Authors of this paper are {self.paper_instance['authors']}.\n\
+                        Abstract of this paper is {self.paper_instance['abstract']}.\n\
+                        Introduction of this paper is {intro_content}.")
+        return prompt
+    def _init_prompt(self, convo_id: str = 'default'):
+        intro_content = ''
+        max_tokens = self.chatPaper.max_tokens
+        prompt = self._get_intro_prompt(intro_content)
+        full_conversation_ = "\n".join([str(x["content"]) for x in self.chatPaper.conversation[convo_id]],)
+        full_conversation = str(full_conversation_ + prompt)
+        # Try to summarize the intro part
+        if(len(self.chatPaper.ENCODER.encode(str(full_conversation)))>max_tokens):
+            prompt = f'This is the introduction, please summarize it and reduct its length in {max_tokens} tokens: {prompt}'
+            intro_content = self._summarize_content(prompt)
+            prompt = self._get_intro_prompt(intro_content)
+            full_conversation = str(full_conversation_ + prompt)
+        # Failed, try to reduce the length of the prompt
+        while(len(self.chatPaper.ENCODER.encode(str(full_conversation)))>max_tokens):
+            prompt = prompt[:self.chatPaper.decrease_step]
+            full_conversation = str(full_conversation_ + prompt)
+        return prompt
+    def _summarize_content(self, content: str = ''):
+        sys_prompt = "Your goal is to summarize the provided content from an academic paper. Your summary should be concise and focus on the key information of the academic paper, do not miss any important point."
+        self.chatPaper.reset(convo_id='summary', system_prompt=sys_prompt)
+        response = self.chatPaper.ask(content, convo_id='summary')
+        res_txt = str(response[0])
+        return res_txt
+    def get_basic_info(self):
+        prompt = f'Introduce this paper (its not necessary to include the basic information like title and author name), comment on this paper based on its abstract and introduction from its 1. Novelty, 2. Improtance, 3. Potential Influence. Relpy in {self.language}'
+        basic_op = self.chatPaper.ask(prompt, convo_id='chat')[0]
+        return basic_op
+    def _read_basic(self, convo_id="chat"):
+        prompt = self._init_prompt(convo_id)
+        self.chatPaper.add_to_conversation(
+            convo_id=convo_id,
+            role="assistant",
+            message= prompt
+        )
+    def read_paper(self, chapter_list: list = [], convo_id="chat"):
+        for chap in chapter_list:
+            prompt = self.paper_instance['content'][chap]
+            sys_prompt = f'This is the {chap} section of this paper, please read carefully and answer the users questions professionally and friendly basic on the content.\n'
+            prompt = sys_prompt + prompt
+            self.chatPaper.add_to_conversation(
+                convo_id=convo_id,
+                role="assistant",
+                message= prompt
+            )
+        return "我读完了这些章节，让我们开始吧! 🤩"
+    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
+                    stop=tenacity.stop_after_attempt(5),
+                    reraise=True)
+    def chat_with_paper(self, prompt):
+        result = self.chatPaper.ask(
+            prompt = prompt,
+            role="user",
+            convo_id="chat",
+        )
+        reply = str(result[0])
+        return reply

src/utils.py ADDED Viewed

	@@ -0,0 +1,5 @@

+language_dict = {
+    'zh': '中文',
+    'en': 'English',
+}