maximka608 committed
Commit b93b2dc · 1 Parent(s): c621bf1
.DS_Store ADDED
Binary file (6.15 kB)
 
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.faiss filter=lfs diff=lfs merge=lfs -text
+ *.json filter=lfs diff=lfs merge=lfs -text
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
+ # Default ignored files
+ /shelf/
+ /workspace.xml
+ # Editor-based HTTP Client requests
+ /httpRequests/
+ # Datasource local storage ignored files
+ /dataSources/
+ /dataSources.local.xml
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,13 @@
+ <component name="InspectionProjectProfileManager">
+ <profile version="1.0">
+ <option name="myName" value="Project Default" />
+ <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+ <option name="ignoredErrors">
+ <list>
+ <option value="N806" />
+ <option value="N803" />
+ </list>
+ </option>
+ </inspection_tool>
+ </profile>
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
+ <component name="InspectionProjectProfileManager">
+ <settings>
+ <option name="USE_PROJECT_PROFILE" value="false" />
+ <version value="1.0" />
+ </settings>
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,7 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+ <component name="Black">
+ <option name="sdkName" value="Python 3.10 (papersRag)" />
+ </component>
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (papersRag)" project-jdk-type="Python SDK" />
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+ <component name="ProjectModuleManager">
+ <modules>
+ <module fileurl="file://$PROJECT_DIR$/.idea/nlp.iml" filepath="$PROJECT_DIR$/.idea/nlp.iml" />
+ </modules>
+ </component>
+ </project>
.idea/nlp.iml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <module type="PYTHON_MODULE" version="4">
+ <component name="NewModuleRootManager">
+ <content url="file://$MODULE_DIR$" />
+ <orderEntry type="jdk" jdkName="Python 3.10 (papersRag)" jdkType="Python SDK" />
+ <orderEntry type="sourceFolder" forTests="false" />
+ </component>
+ </module>
.idea/vcs.xml ADDED
@@ -0,0 +1,7 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+ <component name="VcsDirectoryMappings">
+ <mapping directory="" vcs="Git" />
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
+ </component>
+ </project>
app.py ADDED
@@ -0,0 +1,86 @@
+ import streamlit as st
+ from utils.vector_base import KnowledgeBase
+ from utils.embedding import Embeddings
+ from utils.llm import LLM
+ from config import config
+ import json
+
+
+ def get_embedding_model():
+     return Embeddings()
+
+
+ def get_llm(url, api_key):
+     return LLM(url, api_key)
+
+
+ def get_metadata(path):
+     titles, texts = [], []
+     with open(path, 'rb') as file:
+         metadata = json.load(file)
+     for data in metadata:
+         titles.append(data['title'])
+         texts.append(data['text'])
+     return texts, titles
+
+
+ def combine_docs(indexes, texts):
+     result = ""
+     for i, index in enumerate(indexes):
+         result += " [" + str(i + 1) + "] " + texts[index]
+     return result
+
+
+ def create_prompt(query, docs):
+     system_prompt = f"""You are a language model integrated into a search and
+     generation system based on relevant documents (RAG system).
+     Your task is to provide answers to the user's queries based on the provided
+     documents. Respond only based on the provided documents. Do not make up
+     information that is not in the sources. If you use data from a document,
+     indicate the document number in square brackets. For example: "This term
+     means such-and-such [1]." If there is no information in the documents,
+     politely explain that the information is not available. Do not alter the
+     content of the sources, convey the information accurately.
+     User query: {query}. Documents: {docs}
+     """
+
+     return system_prompt
+
+
+ st.title("PaperRAG")
+ st.write("RAG system for scientific papers with selectable search types")
+
+ query = st.text_input("Enter your query", "")
+ search_types = st.multiselect(
+     "Search Types", options=["Vector", "BM25"], default=["Vector", "BM25"]
+ )
+ llm_url = st.text_input("LLM URL", "", placeholder="Enter LLM ENDPOINT")
+ llm_api_key = st.text_input("LLM API Key", "", placeholder="Enter LLM API Key", type="password")
+
+ if st.button("Search"):
+     if query and llm_url and llm_api_key:
+         model = get_embedding_model()
+         llm = get_llm(llm_url, llm_api_key)
+
+         texts, titles = get_metadata(config.PATH_METADATA)
+         embedding = model.get_query_embedding(query)
+
+         knowledge_base = KnowledgeBase(config.PATH_FAISS, config.PATH_PREPROCESSING_TEXT)
+
+         vector_search = []
+         bm25_search = []
+
+         if "Vector" in search_types:
+             vector_search = knowledge_base.search_by_embedding(embedding, 5)[0].tolist()
+         if "BM25" in search_types:
+             bm25_search = knowledge_base.search_by_BM25(query, 5)
+
+         docs = combine_docs(vector_search + bm25_search, texts)
+         prompt = create_prompt(query, docs)
+
+         response = llm.generate_response(prompt)
+
+         st.subheader("Response")
+         st.write(response)
+     else:
+         st.error("Please fill in all the required fields.")
config.py ADDED
@@ -0,0 +1,10 @@
+ from pathlib import Path
+
+
+ class Config:
+     PATH_FAISS = str(Path(__file__).parent / 'faiss_index.faiss')
+     PATH_METADATA = str(Path(__file__).parent / 'metadata.json')
+     PATH_PREPROCESSING_TEXT = str(Path(__file__).parent / 'preprocessing_text.json')
+
+
+ config = Config()
faiss_index.faiss ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1422951bc60fbc02da260a6d9059740149b8724e13f71b7110e440e66bcc9f79
+ size 76847661
metadata.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:052c218b62d563adf9d26339d58c1296f22e6674f36f3b55e3675c3865e50d8f
+ size 17923018
preprocessing_text.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:51ccf934a26b90ca2d1753d51dd3e5e5498121ba7f661ce14a232a9993667bdf
+ size 8317837
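Note on the three files above: faiss_index.faiss (~77 MB), metadata.json (~18 MB) and preprocessing_text.json (~8 MB) are committed as Git LFS pointers, which is what the new *.faiss / *.json rules in .gitattributes are for. On a fresh clone the actual payloads have to be fetched before app.py can load them, e.g.

    git lfs install
    git lfs pull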
requirements.txt ADDED
@@ -0,0 +1,112 @@
+ aiofiles==23.2.1
+ aiohappyeyeballs==2.4.3
+ aiohttp==3.11.2
+ aiosignal==1.3.1
+ altair==5.5.0
+ annotated-types==0.7.0
+ anyio==4.6.2.post1
+ async-timeout==5.0.1
+ attrs==24.2.0
+ blinker==1.9.0
+ Brotli==1.1.0
+ cachetools==5.5.0
+ certifi==2024.8.30
+ cffi==1.17.1
+ charset-normalizer==3.4.0
+ click==8.1.7
+ cryptography==43.0.3
+ datasets==3.1.0
+ dill==0.3.8
+ distro==1.9.0
+ einops==0.8.0
+ exceptiongroup==1.2.2
+ faiss-cpu==1.9.0
+ fastapi==0.115.5
+ ffmpy==0.4.0
+ filelock==3.16.1
+ frozenlist==1.5.0
+ fsspec==2024.9.0
+ gitdb==4.0.11
+ GitPython==3.1.43
+ gradio==5.6.0
+ gradio_client==1.4.3
+ h11==0.14.0
+ httpcore==1.0.7
+ httpx==0.27.2
+ huggingface-hub==0.26.2
+ idna==3.10
+ Jinja2==3.1.4
+ jiter==0.7.1
+ joblib==1.4.2
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ mdurl==0.1.2
+ mpmath==1.3.0
+ multidict==6.1.0
+ multiprocess==0.70.16
+ narwhals==1.14.2
+ networkx==3.4.2
+ nltk==3.9.1
+ numpy==2.1.3
+ openai==1.54.4
+ orjson==3.10.11
+ packaging==24.2
+ pandas==2.2.3
+ pdfminer.six==20231228
+ pdfplumber==0.11.4
+ pillow==11.0.0
+ propcache==0.2.0
+ protobuf==5.28.3
+ pyarrow==18.0.0
+ pycparser==2.22
+ pydantic==2.9.2
+ pydantic_core==2.23.4
+ pydeck==0.9.1
+ pydub==0.25.1
+ Pygments==2.18.0
+ pypdfium2==4.30.0
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.0.1
+ python-multipart==0.0.12
+ pytz==2024.2
+ PyYAML==6.0.2
+ rank-bm25==0.2.2
+ referencing==0.35.1
+ regex==2024.11.6
+ requests==2.32.3
+ rich==13.9.4
+ rpds-py==0.21.0
+ ruff==0.7.4
+ safehttpx==0.1.1
+ safetensors==0.4.5
+ scikit-learn==1.5.2
+ scipy==1.14.1
+ semantic-version==2.10.0
+ sentence-transformers==3.3.1
+ shellingham==1.5.4
+ six==1.16.0
+ smmap==5.0.1
+ sniffio==1.3.1
+ starlette==0.41.3
+ sympy==1.13.1
+ tenacity==9.0.0
+ threadpoolctl==3.5.0
+ tokenizers==0.20.3
+ toml==0.10.2
+ tomlkit==0.12.0
+ torch==2.5.1
+ torchaudio==2.5.1
+ torchvision==0.20.1
+ tornado==6.4.2
+ tqdm==4.67.0
+ transformers==4.46.3
+ typer==0.13.1
+ typing_extensions==4.12.2
+ tzdata==2024.2
+ urllib3==2.2.3
+ uvicorn==0.32.1
+ websockets==12.0
+ xxhash==3.5.0
+ yarl==1.17.2
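One gap worth noting in this list: streamlit, which app.py imports, is not pinned here, so it presumably has to be installed separately alongside the pinned packages, e.g.

    pip install -r requirements.txt
    pip install streamlit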
script/create_vector_base.py ADDED
@@ -0,0 +1,41 @@
+ import faiss, json
+ from datasets import load_dataset
+ from utils.embedding import Embeddings
+
+ def get_chunks(docs, size):
+     chunked_texts, metadata = [], []
+
+     for text in docs:
+         for i in range(0, len(text['abstract']), size):
+             chunk = text['abstract'][i:i + size]
+
+             chunked_texts.append(chunk)
+             metadata.append({'title': text['title'], 'text': chunk})
+
+     return chunked_texts, metadata
+
+
+ def create_base(docs, model: Embeddings):
+     chunks, metadata = get_chunks(docs, 256)
+     dimension = 384
+     embeddings = model.get_embeddings(chunks)
+     index = faiss.IndexFlatL2(dimension)
+     index.add(embeddings)
+
+     return index, metadata
+
+
+ def main():
+     data = load_dataset("aalksii/ml-arxiv-papers")
+     articles = data['train'].select(range(10000))
+     embed_model = Embeddings()
+
+     vector_base, metadata = create_base(articles, embed_model)
+     faiss.write_index(vector_base, "../faiss_index.faiss")  # written next to metadata.json at the project root
+
+     with open("../metadata.json", "w") as f:
+         json.dump(metadata, f, indent=4)
+
+
+ if __name__ == '__main__':
+     main()
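A hedged usage note (assumptions, not part of the commit): the script imports utils.embedding but writes its outputs with ../-relative paths, so it appears meant to be run from inside script/ with the project root on PYTHONPATH, for example

    cd script
    PYTHONPATH=.. python create_vector_base.py

Run that way, it downloads the aalksii/ml-arxiv-papers dataset, splits the first 10,000 abstracts into 256-character chunks, embeds them with BAAI/bge-small-en-v1.5 (384-dimensional vectors, matching dimension = 384), and writes faiss_index.faiss and metadata.json next to config.py at the project root.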
script/preprocessing_text.py ADDED
@@ -0,0 +1,36 @@
+ import nltk, json
+ from nltk.stem import PorterStemmer
+ from nltk.corpus import stopwords
+ from config import config
+
+
+ class Preprocessor:
+     def _tokenize(self, text):
+         text = text.lower().split(' ')
+         return text
+
+     def preprocessing_text(self, doc):
+         tokens = self._tokenize(doc)
+
+         nltk.download('stopwords')
+         stop_words = set(stopwords.words('english'))
+         filtered_tokens = [token for token in tokens if token not in stop_words]
+
+         stemmer = PorterStemmer()
+         stemmed_tokens = [stemmer.stem(filtered_token) for filtered_token in filtered_tokens]
+         preprocess_text = " ".join(stemmed_tokens)
+         return preprocess_text
+
+     def _save(self, docs):
+         with open("../preprocessing_text.json", "w") as f:
+             json.dump(docs, f, indent=4)
+
+     def preprocessing(self, docs):
+         preprocessed_docs = [self.preprocessing_text(doc) for doc in docs]
+         self._save(preprocessed_docs)
+
+ if __name__ == '__main__':
+     from app import get_metadata  # imported lazily: app.py imports utils.vector_base, which imports this module
+     texts, _ = get_metadata(config.PATH_METADATA)
+     preprocessor = Preprocessor()
+     preprocessor.preprocessing(texts)
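A similar hedged note: this script regenerates preprocessing_text.json, the lower-cased, stop-word-filtered, Porter-stemmed copies of the metadata texts that BM25 searches over, and it also writes with a ../-relative path, so the analogous invocation would be

    cd script
    PYTHONPATH=.. python preprocessing_text.py

Re-running it whenever metadata.json is rebuilt keeps the BM25 corpus aligned with the FAISS index, since utils/vector_base.py maps both result sets back to the same metadata indices.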
utils/embedding.py ADDED
@@ -0,0 +1,14 @@
+ from sentence_transformers import SentenceTransformer
+
+
+ class Embeddings:
+     def __init__(self, model_name: str = 'BAAI/bge-small-en-v1.5'):
+         self.model = SentenceTransformer(model_name, trust_remote_code=True, revision="main")
+
+     def get_query_embedding(self, query):
+         query_embed = self.model.encode([query], normalize_embeddings=True)
+         return query_embed
+
+     def get_embeddings(self, texts):
+         embeddings = self.model.encode(texts, normalize_embeddings=True)
+         return embeddings
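A minimal usage sketch of this wrapper (the example strings are hypothetical):

    from utils.embedding import Embeddings

    model = Embeddings()  # loads BAAI/bge-small-en-v1.5
    query_vec = model.get_query_embedding("graph neural networks")      # shape (1, 384)
    doc_vecs = model.get_embeddings(["abstract one", "abstract two"])   # shape (2, 384)

Both calls return L2-normalized numpy arrays, which is why the flat L2 index built in script/create_vector_base.py effectively ranks by cosine similarity.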
utils/llm.py ADDED
@@ -0,0 +1,28 @@
+ import requests
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ class LLM:
+     def __init__(self, url, api_key):
+         self.endpoint = url
+         self.api_key = api_key
+
+     def generate_response(self, prompt):
+         headers = {
+             "Content-Type": "application/json",
+             "api-key": self.api_key,
+         }
+
+         data = {
+             "messages": [{"role": "user", "content": prompt}],
+             "max_tokens": 1500,
+             "temperature": 0.5,
+         }
+
+         response = requests.post(self.endpoint, headers=headers, json=data)
+
+         if response.status_code == 200:
+             return response.json()["choices"][0]["message"]["content"]
+         else:
+             raise ValueError(response.text)  # surface the error body instead of returning an exception object
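A rough usage sketch (the endpoint URL and key below are placeholders, not values from the repo):

    from utils.llm import LLM

    llm = LLM("https://your-endpoint.example/chat/completions", "YOUR_API_KEY")
    answer = llm.generate_response("Summarize document [1].")

The api-key header and the choices[0].message.content access pattern resemble an Azure-OpenAI-style deployment; any endpoint that speaks that schema should work, and a non-200 response raises ValueError carrying the response body.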
utils/vector_base.py ADDED
@@ -0,0 +1,27 @@
+ import faiss, json
+ from script.preprocessing_text import Preprocessor
+ from rank_bm25 import BM25Okapi
+ import numpy as np
+
+ class KnowledgeBase:
+     def __init__(self, faiss_path, preprocessing_path) -> None:
+         self.BM25_model = BM25Okapi([doc.split() for doc in self._load(preprocessing_path)])  # BM25Okapi expects a tokenized corpus
+         self.vector_base = faiss.read_index(faiss_path)
+
+     def _load(self, path):
+         with open(path, 'rb') as file:
+             data = json.load(file)
+         return data
+
+     def search_by_BM25(self, query, k=5):
+         preprocessor = Preprocessor()
+         prep_query = preprocessor.preprocessing_text(query)
+         doc_scores = self.BM25_model.get_scores(prep_query.split())  # get_scores also expects a token list
+         sorted_docs = np.argsort(-doc_scores)
+         return sorted_docs[:k].tolist()
+
+     def search_by_embedding(self, embedding, k):
+         _, indexes = self.vector_base.search(embedding, k)
+         return indexes
+
+
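A short retrieval sketch tying the pieces together (paths come from config.py; the query string is just an illustration):

    from config import config
    from utils.embedding import Embeddings
    from utils.vector_base import KnowledgeBase

    kb = KnowledgeBase(config.PATH_FAISS, config.PATH_PREPROCESSING_TEXT)
    emb = Embeddings().get_query_embedding("transformers for time series")
    vector_hits = kb.search_by_embedding(emb, 5)[0].tolist()  # row indices into metadata.json
    bm25_hits = kb.search_by_BM25("transformers for time series", 5)

app.py concatenates the two hit lists and looks each index up in the texts loaded from metadata.json, so the FAISS index, the preprocessed BM25 corpus, and the metadata file must all come from the same create_vector_base.py / preprocessing_text.py run to stay aligned.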