serhan committed
Commit 14e11d6 · Parent: bd35fac

Upload 16 files
README.md CHANGED
@@ -1,12 +1,11 @@
 ---
-title: I135e1fi414i41tqe
-emoji: 😻
-colorFrom: yellow
-colorTo: gray
+python_version: 3.10.6
+title: Kanunasor
+emoji: 🏆
+colorFrom: red
+colorTo: blue
 sdk: gradio
 sdk_version: 3.32.0
 app_file: app.py
 pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
ai.py ADDED
@@ -0,0 +1,197 @@
+import numpy as np
+import openai
+import tiktoken
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+from config import Config
+
+
+class AI:
+    """The AI class."""
+
+    def __init__(self, cfg: Config):
+        openai.api_key = cfg.open_ai_key
+        openai.proxy = cfg.open_ai_proxy
+        self._chat_model = cfg.open_ai_chat_model
+        self._use_stream = cfg.use_stream
+        self._encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')
+        self._language = cfg.language
+        self._temperature = cfg.temperature
+
+    def _chat_stream(self, messages: list[dict], use_stream: bool = None) -> str:
+        use_stream = use_stream if use_stream is not None else self._use_stream
+        response = openai.ChatCompletion.create(
+            temperature=self._temperature,
+            stream=use_stream,
+            model=self._chat_model,
+            messages=messages,
+        )
+        if use_stream:
+            data = ""
+            for chunk in response:
+                if chunk.choices[0].delta.get('content', None) is not None:
+                    data += chunk.choices[0].delta.content
+                    print(chunk.choices[0].delta.content, end='')
+            print()
+            return data.strip()
+        else:
+            print(response.choices[0].message.content.strip())
+            print(f"Total tokens used: {response.usage.total_tokens}, "
+                  f"cost: ${response.usage.total_tokens / 1000 * 0.002}")
+            return response.choices[0].message.content.strip()
+
+    def _num_tokens_from_string(self, string: str) -> int:
+        """Returns the number of tokens in a text string."""
+        num_tokens = len(self._encoding.encode(string))
+        return num_tokens
+
+    def completion(self, query: str, context: list[str]):
+        """Create a completion."""
+        context = self._cut_texts(context)
+        print(f"Number of query fragments: {len(context)}")
+
+        text = "\n".join(f"{index}. {text}" for index, text in enumerate(context))
+        result = self._chat_stream([
+            {'role': 'system',
+             'content': f'You are a helpful AI article assistant. '
+                        f'The following are the relevant article content fragments found from the article. '
+                        f'The relevance is sorted from high to low. '
+                        f'You can only answer according to the following content:\n```\n{text}\n```\n'
+                        f'You need to carefully consider your answer to ensure that it is based on the context. '
+                        f'If the context does not mention the content or it is uncertain whether it is correct, '
+                        f'please answer "Bu bilgiye tam olarak hakim değilim, lütfen uzmanlarımıza danışın. Başka bir soru sorabilirsiniz." '
+                        f'You must use {self._language} to respond.'},
+            {'role': 'user', 'content': query},
+        ])
+        return result
+
+    def _cut_texts(self, context):
+        maximum = 4096 - 1024
+        for index, text in enumerate(context):
+            maximum -= self._num_tokens_from_string(text)
+            if maximum < 0:
+                context = context[:index + 1]
+                print(f"Exceeded maximum length, kept only the first {index + 1} fragments")
+                break
+        return context
+
+    def get_keywords(self, query: str) -> str:
+        """Get keywords from the query."""
+        result = self._chat_stream([
+            {'role': 'user',
+             'content': f'You need to extract keywords from the statement or question and '
+                        f'return a series of keywords separated by commas.\ncontent: {query}\nkeywords: '},
+        ], use_stream=False)
+        return result
+
+    @staticmethod
+    def create_embedding(text: str) -> (str, list[float]):
+        """Create an embedding for the provided text."""
+        embedding = openai.Embedding.create(model="text-embedding-ada-002", input=text)
+        return text, embedding.data[0].embedding
+
+    def create_embeddings(self, texts: list[str]) -> (list[tuple[str, list[float]]], int):
+        """Create embeddings for the provided input."""
+        result = []
+        query_len = 0
+        start_index = 0
+        tokens = 0
+
+        def get_embedding(input_slice: list[str]):
+            embedding = openai.Embedding.create(model="text-embedding-ada-002", input=input_slice)
+            return [(txt, data.embedding) for txt, data in
+                    zip(input_slice, embedding.data)], embedding.usage.total_tokens
+
+        for index, text in enumerate(texts):
+            query_len += self._num_tokens_from_string(text)
+            if query_len > 8192 - 1024:
+                ebd, tk = get_embedding(texts[start_index:index + 1])
+                print(f"Query fragments used tokens: {tk}, cost: ${tk / 1000 * 0.0004}")
+                query_len = 0
+                start_index = index + 1
+                tokens += tk
+                result.extend(ebd)
+
+        if query_len > 0:
+            ebd, tk = get_embedding(texts[start_index:])
+            print(f"Query fragments used tokens: {tk}, cost: ${tk / 1000 * 0.0004}")
+            tokens += tk
+            result.extend(ebd)
+        return result, tokens
+
+    def generate_summary(self, embeddings, num_candidates=3, use_sif=False):
+        """Generate a summary for the provided embeddings."""
+        avg_func = self._calc_paragraph_avg_embedding_with_sif if use_sif else self._calc_avg_embedding
+        avg_embedding = np.array(avg_func(embeddings))
+
+        paragraphs = [e[0] for e in embeddings]
+        embeddings = np.array([e[1] for e in embeddings])
+        # Calculate the similarity score between each paragraph and the entire text.
+        similarity_scores = cosine_similarity(embeddings, avg_embedding.reshape(1, -1)).flatten()
+
+        # Select the paragraphs with the highest similarity scores as candidates for the summary.
+        candidate_indices = np.argsort(similarity_scores)[::-1][:num_candidates]
+        candidate_paragraphs = [f"paragraph {i}: {paragraphs[i]}" for i in candidate_indices]
+
+        print("Calculation completed, start generating summary")
+
+        candidate_paragraphs = self._cut_texts(candidate_paragraphs)
+
+        text = "\n".join(f"{index}. {text}" for index, text in enumerate(candidate_paragraphs))
+        result = self._chat_stream([
+            {'role': 'system',
+             'content': f'As a helpful AI article assistant, '
+                        f'I have retrieved the following relevant text fragments from the article, '
+                        f'sorted by relevance from high to low. '
+                        f'You need to summarize the entire article from these fragments, '
+                        f'and present the final result in {self._language}:\n\n{text}\n\n{self._language} summary:'},
+        ])
+        return result
+
+    @staticmethod
+    def _calc_avg_embedding(embeddings) -> list[float]:
+        # Calculate the average embedding for the entire text.
+        avg_embedding = np.zeros(len(embeddings[0][1]))
+        for emb in embeddings:
+            avg_embedding += np.array(emb[1])
+        avg_embedding /= len(embeddings)
+        return avg_embedding.tolist()
+
+    @staticmethod
+    def _calc_paragraph_avg_embedding_with_sif(paragraph_list) -> list[float]:
+        # Calculate the SIF-weighted embedding for the entire text.
+        alpha = 0.001
+        # Calculate the total number of sentences.
+        n_sentences = len(paragraph_list)
+
+        # Calculate the total number of dimensions in the embeddings.
+        n_dims = len(paragraph_list[0][1])
+
+        # Calculate the IDF values for each word in the sentences.
+        vectorizer = TfidfVectorizer(use_idf=True)
+        vectorizer.fit_transform([paragraph for paragraph, _ in paragraph_list])
+        idf = vectorizer.idf_
+
+        # Calculate the SIF weights for each sentence.
+        weights = np.zeros((n_sentences, n_dims))
+        for i, (sentence, embedding) in enumerate(paragraph_list):
+            sentence_words = sentence.split()
+            for word in sentence_words:
+                try:
+                    word_index = vectorizer.vocabulary_[word]
+                    word_idf = idf[word_index]
+                    word_weight = alpha / (alpha + word_idf)
+                    weights[i] += word_weight * (np.array(embedding) / np.max(embedding))
+                except KeyError:
+                    pass
+
+        # Calculate the weighted average of the sentence embeddings.
+        weights_sum = np.sum(weights, axis=0)
+        weights_sum /= n_sentences
+        avg_embedding = np.zeros(n_dims)
+        for i, (sentence, embedding) in enumerate(paragraph_list):
+            avg_embedding += (np.array(embedding) / np.max(embedding)) - weights[i]
+        avg_embedding /= n_sentences
+
+        return avg_embedding.tolist()
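For orientation, here is a minimal sketch of how the `AI` class above is meant to be driven end to end. It assumes a valid `Config` (a `config.json` next to the code and the `open_ai_key` environment variable set); the sample paragraphs are made-up placeholders and the calls go out to the OpenAI API.

```python
from ai import AI
from config import Config

cfg = Config()  # needs config.json plus the open_ai_key environment variable
ai = AI(cfg)

paragraphs = ["First paragraph of the article.", "Second paragraph of the article."]  # placeholder input
embeddings, tokens = ai.create_embeddings(paragraphs)  # [(text, 1536-dim vector), ...], total tokens used
print(f"Indexed {len(embeddings)} fragments using {tokens} tokens")

# Answer a question against the fragments (normally the top matches retrieved from Storage).
answer = ai.completion("What is the article about?", [text for text, _ in embeddings])
```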
api.py ADDED
@@ -0,0 +1,117 @@
+import os
+import shutil
+
+import uvicorn
+import xxhash
+from fastapi import FastAPI, UploadFile, File
+from fastapi.exceptions import RequestValidationError
+from pydantic import BaseModel
+from starlette.exceptions import HTTPException
+from starlette.requests import Request
+from starlette.responses import JSONResponse
+
+from ai import AI
+from config import Config
+from contents import web_crawler_newspaper, extract_text_from_txt, extract_text_from_docx, \
+    extract_text_from_pdf
+from storage import Storage
+
+
+def api(cfg: Config):
+    """Run the API."""
+
+    cfg.use_stream = False
+    ai = AI(cfg)
+
+    app = FastAPI()
+
+    class CrawlerUrlRequest(BaseModel):
+        url: str
+
+    @app.post("/crawler_url")
+    async def crawler_url(req: CrawlerUrlRequest):
+        """Crawl the URL."""
+        contents, lang = web_crawler_newspaper(req.url)
+        hash_id = xxhash.xxh3_128_hexdigest('\n'.join(contents))
+        tokens = _save_to_storage(contents, hash_id)
+        return {"code": 0, "msg": "ok", "data": {"uri": f"{hash_id}/{lang}", "tokens": tokens}}
+
+    def _save_to_storage(contents, hash_id):
+        storage = Storage.create_storage(cfg)
+        if storage.been_indexed(hash_id):
+            return 0
+        else:
+            embeddings, tokens = ai.create_embeddings(contents)
+            storage.add_all(embeddings, hash_id)
+            return tokens
+
+    @app.post("/upload_file")
+    async def create_upload_file(file: UploadFile = File(...)):
+        """Upload a file."""
+        # Save the file to disk.
+        file_name = file.filename
+        os.makedirs('./upload', exist_ok=True)
+        upload_path = os.path.join('./upload', file_name)
+        with open(upload_path, "wb") as buffer:
+            shutil.copyfileobj(file.file, buffer)
+        if file_name.endswith('.pdf'):
+            contents, lang = extract_text_from_pdf(upload_path)
+        elif file_name.endswith('.txt'):
+            contents, lang = extract_text_from_txt(upload_path)
+        elif file_name.endswith('.docx'):
+            contents, lang = extract_text_from_docx(upload_path)
+        else:
+            return {"code": 1, "msg": "not support", "data": {}}
+        hash_id = xxhash.xxh3_128_hexdigest('\n'.join(contents))
+        tokens = _save_to_storage(contents, hash_id)
+        os.remove(upload_path)
+        return {"code": 0, "msg": "ok", "data": {"uri": f"{hash_id}/{lang}", "tokens": tokens}}
+
+    @app.get("/summary")
+    async def summary(uri: str):
+        """Generate a summary."""
+        hash_id, lang = uri.split('/')
+        storage = Storage.create_storage(cfg)
+        if not storage or not lang:
+            return {"code": 1, "msg": "not found", "data": {}}
+        s = ai.generate_summary(storage.get_all_embeddings(hash_id), num_candidates=100,
+                                use_sif=lang not in ['zh', 'ja', 'ko', 'hi', 'ar', 'fa'])
+        return {"code": 0, "msg": "ok", "data": {"summary": s}}
+
+    class AnswerRequest(BaseModel):
+        uri: str
+        query: str
+
+    @app.get("/answer")
+    async def answer(req: AnswerRequest):
+        """Answer a query."""
+        hash_id, lang = req.uri.split('/')
+        storage = Storage.create_storage(cfg)
+        if not storage or not lang:
+            return {"code": 1, "msg": "not found", "data": {}}
+        keywords = ai.get_keywords(req.query)
+        _, embedding = ai.create_embedding(keywords)
+        texts = storage.get_texts(embedding, hash_id)
+        s = ai.completion(req.query, texts)
+        return {"code": 0, "msg": "ok", "data": {"answer": s}}
+
+    @app.exception_handler(RequestValidationError)
+    async def validate_error_handler(request: Request, exc: RequestValidationError):
+        """Error handler."""
+        print("validate_error_handler: ", request.url, exc)
+        return JSONResponse(
+            status_code=400,
+            content={"code": 1, "msg": str(exc.errors()), "data": {}},
+        )
+
+    @app.exception_handler(HTTPException)
+    async def http_error_handler(request: Request, exc):
+        """Error handler."""
+        print("http error_handler: ", request.url, exc)
+        return JSONResponse(
+            status_code=400,
+            content={"code": 1, "msg": exc.detail, "data": {}},
+        )
+
+    # Run the API.
+    uvicorn.run(app, host=cfg.api_host, port=cfg.api_port)
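A rough sketch of how these endpoints could be exercised once the app runs in `api` mode; the host and port follow the defaults in config.json below, and the article URL is a placeholder.

```python
import requests

base = "http://localhost:9531"  # api_host / api_port from config.json

# Index an article by URL; the response carries a "<hash_id>/<lang>" uri.
resp = requests.post(f"{base}/crawler_url", json={"url": "https://example.com/article"})
uri = resp.json()["data"]["uri"]

# Generate a summary for the indexed article.
summary = requests.get(f"{base}/summary", params={"uri": uri}).json()["data"]["summary"]
print(summary)
```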
app.py ADDED
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from api import api
+from config import Config
+from console import console
+from webui import webui
+
+
+def run():
+    """Run the program."""
+    cfg = Config()
+
+    mode = cfg.mode
+    if mode == 'console':
+        console(cfg)
+    elif mode == 'api':
+        api(cfg)
+    elif mode == 'webui':
+        webui(cfg)
+    else:
+        raise ValueError('mode must be console, api or webui')
+
+
+if __name__ == '__main__':
+    run()
config.json ADDED
@@ -0,0 +1,14 @@
+{
+  "temperature": 0.1,
+  "language": "Turkish",
+  "open_ai_chat_model": "gpt-3.5-turbo",
+  "use_stream": false,
+  "use_postgres": false,
+  "index_path": "./index",
+  "postgres_url": "postgresql://localhost:5432/mydb",
+  "mode": "webui",
+  "api_port": 9531,
+  "api_host": "localhost",
+  "webui_port": 7860,
+  "webui_host": "0.0.0.0"
+}
config.py ADDED
@@ -0,0 +1,36 @@
+import json
+import os
+
+
+class Config:
+    def __init__(self):
+        config_path = os.path.join(os.path.dirname(__file__), 'config.json')
+        if not os.path.exists(config_path):
+            raise FileNotFoundError(f'config.json not found at {config_path}, '
+                                    f'please copy config.example.json to config.json and modify it.')
+        with open(config_path, 'r') as f:
+            self.open_ai_key = os.environ.get('open_ai_key')
+            self.config = json.load(f)
+        self.language = self.config.get('language', 'Chinese')
+        self.open_ai_proxy = self.config.get('open_ai_proxy')
+        self.open_ai_chat_model = self.config.get('open_ai_chat_model', 'gpt-3.5-turbo')
+        if not self.open_ai_key:
+            raise ValueError('open_ai_key is not set')
+        self.temperature = self.config.get('temperature', 0.1)
+        if self.temperature < 0 or self.temperature > 1:
+            raise ValueError('temperature must be between 0 and 1, lower is more conservative, higher is more creative')
+        self.use_stream = self.config.get('use_stream', False)
+        self.use_postgres = self.config.get('use_postgres', False)
+        if not self.use_postgres:
+            self.index_path = self.config.get('index_path', './temp')
+            os.makedirs(self.index_path, exist_ok=True)
+        self.postgres_url = self.config.get('postgres_url')
+        if self.use_postgres and self.postgres_url is None:
+            raise ValueError('postgres_url is not set')
+        self.mode = self.config.get('mode', 'webui')
+        if self.mode not in ['console', 'api', 'webui']:
+            raise ValueError('mode must be console, api or webui')
+        self.api_port = self.config.get('api_port', 9531)
+        self.api_host = self.config.get('api_host', 'localhost')
+        self.webui_port = self.config.get('webui_port', 7860)
+        self.webui_host = self.config.get('webui_host', '0.0.0.0')
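Note that only the OpenAI key comes from the environment; every other setting is read from the config.json sitting next to config.py. A minimal sketch (the key value is a placeholder):

```python
import os

os.environ["open_ai_key"] = "sk-..."  # placeholder; must be set before Config() is constructed

from config import Config

cfg = Config()
print(cfg.mode, cfg.language, cfg.index_path)
```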
console.py ADDED
@@ -0,0 +1,120 @@
+import xxhash
+
+from ai import AI
+from config import Config
+from storage import Storage
+from contents import *
+
+
+def console(cfg: Config):
+    try:
+        while True:
+            if not _console(cfg):
+                return
+    except KeyboardInterrupt:
+        print("exit")
+
+
+def _console(cfg: Config) -> bool:
+    """Run the console."""
+
+    contents, lang, identify = _get_contents()
+
+    print("The article has been retrieved, and the number of text fragments is:", len(contents))
+    for content in contents:
+        print('\t', content)
+
+    ai = AI(cfg)
+    storage = Storage.create_storage(cfg)
+
+    print("=====================================")
+    if storage.been_indexed(identify):
+        print("The article has already been indexed, so there is no need to index it again.")
+        print("=====================================")
+    else:
+        # 1. Generate an embedding for each paragraph of the article.
+        embeddings, tokens = ai.create_embeddings(contents)
+        print(f"Embeddings have been created with {len(embeddings)} embeddings, using {tokens} tokens, "
+              f"costing ${tokens / 1000 * 0.0004}")
+
+        storage.add_all(embeddings, identify)
+        print("The embeddings have been saved.")
+        print("=====================================")
+
+    while True:
+        query = input("Please enter your query (/help to view commands):").strip()
+        if query.startswith("/"):
+            if query == "/quit":
+                return False
+            elif query == "/reset":
+                print("=====================================")
+                return True
+            elif query == "/summary":
+                # Generate an embedding-based summary, using a SIF-weighted average or a plain average depending on the language.
+                ai.generate_summary(storage.get_all_embeddings(identify), num_candidates=100,
+                                    use_sif=lang not in ['zh', 'ja', 'ko', 'hi', 'ar', 'fa'])
+            elif query == "/reindex":
+                # Re-index, which will clear the database.
+                storage.clear(identify)
+                embeddings, tokens = ai.create_embeddings(contents)
+                print(f"Embeddings have been created with {len(embeddings)} embeddings, using {tokens} tokens, "
+                      f"costing ${tokens / 1000 * 0.0004}")
+
+                storage.add_all(embeddings, identify)
+                print("The embeddings have been saved.")
+            elif query == "/help":
+                print("Enter /summary to generate an embedding-based summary.")
+                print("Enter /reindex to re-index the article.")
+                print("Enter /reset to start over.")
+                print("Enter /quit to exit.")
+                print("Enter any other content for a query.")
+            else:
+                print("Invalid command.")
+                print("Enter /summary to generate an embedding-based summary.")
+                print("Enter /reindex to re-index the article.")
+                print("Enter /reset to start over.")
+                print("Enter /quit to exit.")
+                print("Enter any other content for a query.")
+            print("=====================================")
+            continue
+        else:
+            # 1. Generate keywords.
+            print("Generating keywords.")
+            keywords = ai.get_keywords(query)
+            # 2. Generate an embedding for the question.
+            _, embedding = ai.create_embedding(keywords)
+            # 3. Find the most similar fragments from the database.
+            texts = storage.get_texts(embedding, identify)
+            print("Related fragments found (first 5):")
+            for text in texts[:5]:
+                print('\t', text)
+            # 4. Push the relevant fragments to the AI, which will answer the question based on these fragments.
+            ai.completion(query, texts)
+            print("=====================================")
+
+
+def _get_contents() -> tuple[list[str], str, str]:
+    """Get the contents."""
+
+    while True:
+        try:
+            url = input("Please enter the link to the article or the file path of the PDF/TXT/DOCX document: ").strip()
+            if os.path.exists(url):
+                if url.endswith('.pdf'):
+                    contents, data = extract_text_from_pdf(url)
+                elif url.endswith('.txt'):
+                    contents, data = extract_text_from_txt(url)
+                elif url.endswith('.docx'):
+                    contents, data = extract_text_from_docx(url)
+                else:
+                    print("Unsupported file format.")
+                    continue
+            else:
+                contents, data = web_crawler_newspaper(url)
+            if not contents:
+                print("Unable to retrieve the content of the article. Please enter the link to the article or "
+                      "the file path of the PDF/TXT/DOCX document again.")
+                continue
+            return contents, data, xxhash.xxh3_128_hexdigest('\n'.join(contents))
+        except Exception as e:
+            print("Error:", e)
contents.py ADDED
@@ -0,0 +1,81 @@
+import os
+import time
+
+import PyPDF2
+import docx
+import readability
+from langdetect import detect
+from newspaper import fulltext, Article
+from selenium import webdriver
+
+
+def web_crawler_newspaper(url: str) -> tuple[list[str], str]:
+    """Run the web crawler."""
+    raw_html, lang = _get_raw_html(url)
+    try:
+        text = fulltext(raw_html, language=lang)
+    except Exception:
+        article = Article(url)
+        article.download()
+        article.parse()
+        text = article.text
+    contents = [text.strip() for text in text.splitlines() if text.strip()]
+    return contents, lang
+
+
+def _get_raw_html(url):
+    chrome_options = webdriver.ChromeOptions()
+    chrome_options.add_argument('--headless')
+    chrome_options.add_argument('--disable-gpu')
+    chrome_options.add_argument('--no-sandbox')
+    chrome_options.add_argument('--disable-dev-shm-usage')
+    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
+                                'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36')
+
+    with webdriver.Chrome(options=chrome_options) as driver:
+        driver.get(url)
+        print("Please wait for 5 seconds until the webpage finishes loading.")
+        time.sleep(5)
+        html = driver.page_source
+
+    doc = readability.Document(html)
+    html = doc.summary()
+    lang = detect(html)
+    return html, lang[0:2]
+
+
+def extract_text_from_pdf(file_path: str) -> tuple[list[str], str]:
+    """Extract text content from a PDF file."""
+    with open(file_path, 'rb') as f:
+        pdf_reader = PyPDF2.PdfReader(f)
+        contents = []
+        for page in pdf_reader.pages:
+            page_text = page.extract_text().strip()
+            raw_text = [text.strip() for text in page_text.splitlines() if text.strip()]
+            new_text = ''
+            for text in raw_text:
+                new_text += text
+                if text[-1] in ['.', '!', '?', '。', '!', '?', '…', ';', ';', ':', ':', '”', '’', ')', '】', '》', '」',
+                                '』', '〕', '〉', '》', '〗', '〞', '〟', '»', '"', "'", ')', ']', '}']:
+                    contents.append(new_text)
+                    new_text = ''
+            if new_text:
+                contents.append(new_text)
+    lang = detect('\n'.join(contents))
+    return contents, lang[0:2]
+
+
+def extract_text_from_txt(file_path: str) -> tuple[list[str], str]:
+    """Extract text content from a TXT file."""
+    with open(file_path, 'r', encoding='utf-8') as f:
+        contents = [text.strip() for text in f.readlines() if text.strip()]
+    lang = detect('\n'.join(contents))
+    return contents, lang[0:2]
+
+
+def extract_text_from_docx(file_path: str) -> tuple[list[str], str]:
+    """Extract text content from a DOCX file."""
+    document = docx.Document(file_path)
+    contents = [paragraph.text.strip() for paragraph in document.paragraphs if paragraph.text.strip()]
+    lang = detect('\n'.join(contents))
+    return contents, lang[0:2]
index/bkp/dd771cb6c4718ace4c4c596f4792cfdd.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0c69dbad0041bb5e8e4b1e26793f8f38b56f5abb7c2db2dad201d13ce3a041d1
+size 3408298
index/bkp/dd771cb6c4718ace4c4c596f4792cfdd.csv ADDED
The diff for this file is too large to render. See raw diff
 
index/dd771cb6c4718ace4c4c596f4792cfdd.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0c69dbad0041bb5e8e4b1e26793f8f38b56f5abb7c2db2dad201d13ce3a041d1
+size 3408298
index/dd771cb6c4718ace4c4c596f4792cfdd.csv ADDED
The diff for this file is too large to render. See raw diff
 
main.py ADDED
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from api import api
+from config import Config
+from console import console
+from webui import webui
+
+
+def run():
+    """Run the program."""
+    cfg = Config()
+
+    mode = cfg.mode
+    if mode == 'console':
+        console(cfg)
+    elif mode == 'api':
+        api(cfg)
+    elif mode == 'webui':
+        webui(cfg)
+    else:
+        raise ValueError('mode must be console, api or webui')
+
+
+if __name__ == '__main__':
+    run()
requirements.txt ADDED
@@ -0,0 +1,79 @@
+aiohttp
+aiosignal
+anyio
+async-generator
+async-timeout
+attrs
+beautifulsoup4
+certifi
+cffi
+chardet
+charset-normalizer
+click
+colorama
+cssselect
+exceptiongroup
+faiss-cpu
+fastapi
+feedfinder2
+feedparser
+filelock
+frozenlist
+greenlet
+h11
+httptools
+idna
+jieba3k
+joblib
+langdetect
+lxml
+multidict
+newspaper3k
+nltk
+numpy
+openai
+outcome
+pandas
+pgvector
+Pillow
+pycparser
+pydantic
+PyPDF2
+PySocks
+python-dateutil
+python-docx
+python-dotenv
+python-multipart
+pytz
+PyYAML
+readability-lxml
+regex
+requests
+requests-file
+scikit-learn
+scipy
+selenium
+sgmllib3k
+six
+sniffio
+sortedcontainers
+soupsieve
+SQLAlchemy
+starlette
+threadpoolctl
+tiktoken
+tinysegmenter
+tldextract
+tqdm
+trio
+trio-websocket
+typing_extensions
+urllib3
+uvicorn
+watchfiles
+websockets
+wsproto
+xxhash
+yarl
+gradio
+psycopg2
storage.py ADDED
@@ -0,0 +1,171 @@
+import os.path
+from abc import ABC, abstractmethod
+
+import faiss
+import numpy as np
+import pandas as pd
+from pgvector.sqlalchemy import Vector
+from sqlalchemy import create_engine, Column, Integer, String
+from sqlalchemy.orm import sessionmaker, declarative_base
+
+from config import Config
+
+Base = declarative_base()
+
+
+class Storage(ABC):
+    """Abstract Storage class."""
+
+    # factory method
+    @staticmethod
+    def create_storage(cfg: Config) -> 'Storage':
+        """Create a storage object."""
+        if cfg.use_postgres:
+            return _PostgresStorage(cfg)
+        else:
+            return _IndexStorage(cfg)
+
+    @abstractmethod
+    def add_all(self, embeddings: list[tuple[str, list[float]]], name: str):
+        """Add multiple embeddings."""
+        pass
+
+    @abstractmethod
+    def get_texts(self, embedding: list[float], name: str, limit=100) -> list[str]:
+        """Get the text for the provided embedding."""
+        pass
+
+    @abstractmethod
+    def get_all_embeddings(self, name: str):
+        """Get all embeddings."""
+        pass
+
+    @abstractmethod
+    def clear(self, name: str):
+        """Clear the database."""
+        pass
+
+    @abstractmethod
+    def been_indexed(self, name: str) -> bool:
+        """Check if the database has been indexed."""
+        pass
+
+
+class _IndexStorage(Storage):
+    """IndexStorage class."""
+
+    def __init__(self, cfg: Config):
+        """Initialize the storage."""
+        self._cfg = cfg
+
+    def add_all(self, embeddings: list[tuple[str, list[float]]], name):
+        """Add multiple embeddings."""
+        texts, index = self._load(name)
+        ids = np.array([len(texts) + i for i, _ in enumerate(embeddings)])
+        texts = pd.concat([texts, pd.DataFrame(
+            {'index': len(texts) + i, 'text': text} for i, (text, _) in enumerate(embeddings))])
+        array = np.array([emb for text, emb in embeddings])
+        index.add_with_ids(array, ids)
+        self._save(texts, index, name)
+
+    def get_texts(self, embedding: list[float], name: str, limit=100) -> list[str]:
+        """Get the text for the provided embedding."""
+        texts, index = self._load(name)
+        _, indexs = index.search(np.array([embedding]), limit)
+        indexs = [i for i in indexs[0] if i >= 0]
+        return [f'paragraph {p}: {t}' for _, p, t in texts.iloc[indexs].values]
+
+    def get_all_embeddings(self, name: str):
+        texts, index = self._load(name)
+        texts = texts.text.tolist()
+        embeddings = index.reconstruct_n(0, len(texts))
+        return list(zip(texts, embeddings))
+
+    def clear(self, name: str):
+        """Clear the database."""
+        self._delete(name)
+
+    def been_indexed(self, name: str) -> bool:
+        return os.path.exists(os.path.join(self._cfg.index_path, f'{name}.csv')) and os.path.exists(
+            os.path.join(self._cfg.index_path, f'{name}.bin'))
+
+    def _save(self, texts, index, name: str):
+        texts.to_csv(os.path.join(self._cfg.index_path, f'{name}.csv'))
+        faiss.write_index(index, os.path.join(self._cfg.index_path, f'{name}.bin'))
+
+    def _load(self, name: str):
+        if self.been_indexed(name):
+            texts = pd.read_csv(os.path.join(self._cfg.index_path, f'{name}.csv'))
+            index = faiss.read_index(os.path.join(self._cfg.index_path, f'{name}.bin'))
+        else:
+            texts = pd.DataFrame(columns=['index', 'text'])
+            # IDMap2 with Flat
+            index = faiss.index_factory(1536, "IDMap2,Flat", faiss.METRIC_INNER_PRODUCT)
+        return texts, index
+
+    def _delete(self, name: str):
+        try:
+            os.remove(os.path.join(self._cfg.index_path, f'{name}.csv'))
+            os.remove(os.path.join(self._cfg.index_path, f'{name}.bin'))
+        except FileNotFoundError:
+            pass
+
+
+def singleton(cls):
+    instances = {}
+
+    def get_instance(cfg):
+        if cls not in instances:
+            instances[cls] = cls(cfg)
+        return instances[cls]
+
+    return get_instance
+
+
+@singleton
+class _PostgresStorage(Storage):
+    """PostgresStorage class."""
+
+    def __init__(self, cfg: Config):
+        """Initialize the storage."""
+        self._postgresql = cfg.postgres_url
+        self._engine = create_engine(self._postgresql)
+        Base.metadata.create_all(self._engine)
+        session = sessionmaker(bind=self._engine)
+        self._session = session()
+
+    def add_all(self, embeddings: list[tuple[str, list[float]]], name: str):
+        """Add multiple embeddings."""
+        data = [self.EmbeddingEntity(text=text, embedding=embedding, name=name) for text, embedding in embeddings]
+        self._session.add_all(data)
+        self._session.commit()
+
+    def get_texts(self, embedding: list[float], name: str, limit=100) -> list[str]:
+        """Get the text for the provided embedding."""
+        result = self._session.query(self.EmbeddingEntity).where(self.EmbeddingEntity.name == name).order_by(
+            self.EmbeddingEntity.embedding.cosine_distance(embedding)).limit(limit).all()
+        return [f'paragraph {s.id}: {s.text}' for s in result]
+
+    def get_all_embeddings(self, name: str):
+        """Get all embeddings."""
+        result = self._session.query(self.EmbeddingEntity).where(self.EmbeddingEntity.name == name).all()
+        return [(s.text, s.embedding) for s in result]
+
+    def clear(self, name: str):
+        """Clear the database."""
+        self._session.query(self.EmbeddingEntity).where(self.EmbeddingEntity.name == name).delete()
+        self._session.commit()
+
+    def been_indexed(self, name: str) -> bool:
+        return self._session.query(self.EmbeddingEntity).filter_by(name=name).first() is not None
+
+    def __del__(self):
+        """Close the session."""
+        self._session.close()
+
+    class EmbeddingEntity(Base):
+        __tablename__ = 'embedding'
+        id = Column(Integer, primary_key=True)
+        name = Column(String)
+        text = Column(String)
+        embedding = Column(Vector(1536))
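`_IndexStorage` keeps one `IDMap2,Flat` inner-product FAISS index of 1536-dimensional vectors (the size of text-embedding-ada-002 embeddings) per article hash, plus a CSV holding the paragraph texts. A standalone sketch of that index layout, independent of the class above and using random stand-in vectors:

```python
import faiss
import numpy as np

dim = 1536  # text-embedding-ada-002 vector size
index = faiss.index_factory(dim, "IDMap2,Flat", faiss.METRIC_INNER_PRODUCT)

vectors = np.random.rand(10, dim).astype('float32')        # stand-ins for paragraph embeddings
index.add_with_ids(vectors, np.arange(10).astype('int64'))  # ids map back to paragraph rows

# Query with one vector; scores are inner products, ids identify the matching paragraphs.
scores, ids = index.search(vectors[:1], 3)
print(ids[0], scores[0])
```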
webui.py ADDED
@@ -0,0 +1,71 @@
+import gradio as gr
+import xxhash
+from gradio.components import _Keywords
+
+from ai import AI
+from config import Config
+from contents import *
+from storage import Storage
+
+
+def webui(cfg: Config):
+    """Run the web UI."""
+    Webui(cfg).run()
+
+
+class Webui:
+    def __init__(self, cfg: Config):
+        self.cfg = cfg
+        self.ai = AI(cfg)
+        self.storage = Storage.create_storage(self.cfg)  # Initialize storage here
+
+    def _save_to_storage(self, contents, hash_id):
+        print(f"Saving to storage {hash_id}")
+        print(f"Contents: \n{contents}")
+        self.storage = Storage.create_storage(self.cfg)
+        if self.storage.been_indexed(hash_id):
+            return 0
+        else:
+            embeddings, tokens = self.ai.create_embeddings(contents)
+            self.storage.add_all(embeddings, hash_id)
+            return tokens
+
+    def _get_hash_id(self, contents):
+        return xxhash.xxh3_128_hexdigest('\n'.join(contents))
+
+    def run(self):
+        with gr.Blocks(theme=gr.themes.Monochrome(), css="footer {visibility: hidden}") as demo:
+
+            hash_id_state = gr.State('dd771cb6c4718ace4c4c596f4792cfdd')  # Initialize hash_id_state to 'dd771cb6c4718ace4c4c596f4792cfdd'
+            chat_page = gr.Column(visible=True)  # Set chat_page to visible by default
+
+            with chat_page:
+                with gr.Row():
+                    with gr.Column():
+                        chatbot = gr.Chatbot(label="Kanunla Konuş")
+                        msg = gr.Textbox(label="Sorunuzu Yazın (Bu deneysel bir projedir, tam ve doğru bilgi için, uzmanlarımıza danışın)")
+                        submit_box = gr.Button("Kanuna Sor", variant="primary")
+
+            def respond(message, chat_history, hash_id):
+                kw = self.ai.get_keywords(message)
+                if len(kw) == 0 or hash_id is None:
+                    return "", chat_history
+                _, kw_ebd = self.ai.create_embedding(kw)
+                ctx = self.storage.get_texts(kw_ebd, hash_id)
+                print(f"Context: \n{ctx}")
+                bot_message = self.ai.completion(message, ctx)
+                chat_history.append((message, bot_message))
+                return "", chat_history
+
+            def reset():
+                return {
+                    chat_page: gr.update(visible=True),
+                    chatbot: gr.update(value=[]),
+                    msg: gr.update(value=""),
+                    hash_id_state: 'dd771cb6c4718ace4c4c596f4792cfdd',
+                }
+
+            msg.submit(respond, [msg, chatbot, hash_id_state], [msg, chatbot])
+            submit_box.click(respond, [msg, chatbot, hash_id_state], [msg, chatbot])
+        demo.title = "Kanuna Sor"
+        demo.launch(server_port=self.cfg.webui_port, server_name=self.cfg.webui_host, show_api=False)