Teapack1 committed · commit 99afe26
Parent(s): initial
Browse files
- .gitattributes +35 -0
- .gitignore +13 -0
- README.md +13 -0
- app.py +4 -0
- fast_app.py +146 -0
- fast_app_cz(obsolete).py +110 -0
- ingest(obsolete).py +59 -0
- ingest.py +127 -0
- requirements.txt +15 -0
- support/images-text.py +60 -0
- support/process_archives.py +87 -0
- templates/index.html +316 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,13 @@
+/libpff-20231205
+/libpff-alpha-20231205.tar.gz
+/venv
+/.chainlit
+/model
+/__pycache__
+.files
+*.gguf
+*.pst
+/data
+*.jpg
+/*.env
+/*.log
README.md
ADDED
@@ -0,0 +1,13 @@
+---
+title: RAG Retrieve Ingest Cz Eng
+emoji: 😻
+colorFrom: purple
+colorTo: yellow
+sdk: gradio
+sdk_version: 4.19.1
+app_file: app.py
+pinned: false
+license: mit
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,4 @@
+import subprocess
+
+print("Starting the FastAPI server...")
+subprocess.run("uvicorn fast_app:app --host 0.0.0.0 --port 7860", shell=True)
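Note: app.py only shells out to uvicorn. A minimal equivalent sketch that starts the same server programmatically (assuming fast_app.py sits next to app.py, as in this commit):

import uvicorn

# Launch the FastAPI app defined in fast_app.py on the same host/port app.py uses.
uvicorn.run("fast_app:app", host="0.0.0.0", port=7860)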
fast_app.py
ADDED
@@ -0,0 +1,146 @@
+from dotenv import load_dotenv
+import os
+import json
+from fastapi import FastAPI, Request, Form, Response
+from fastapi.responses import HTMLResponse
+from fastapi.templating import Jinja2Templates
+from fastapi.staticfiles import StaticFiles
+from fastapi.encoders import jsonable_encoder
+
+from langchain.vectorstores import Chroma
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from langchain.chains import RetrievalQA
+from langchain.document_loaders import (
+    TextLoader,
+    PyPDFLoader,
+    DirectoryLoader,
+    UnstructuredFileLoader,
+)
+from langchain.document_loaders.csv_loader import CSVLoader
+from langchain.llms import OpenAI
+from langchain import PromptTemplate
+from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
+
+from ingest import Ingest
+
+# setx OPENAI_API_KEY "your_openai_api_key_here"
+
+# Access the Hugging Face API token from an environment variable
+# huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
+# if huggingface_token is None:
+#     raise ValueError("Hugging Face token is not set in environment variables.")
+
+openai_api_key = os.getenv("OPENAI_API_KEY")
+if openai_api_key is None:
+    raise ValueError("OAI token is not set in environment variables.")
+
+
+app = FastAPI()
+templates = Jinja2Templates(directory="templates")
+app.mount("/static", StaticFiles(directory="static"), name="static")
+
+czech_store = "stores/czech_512"
+english_store = "stores/english_256"
+
+ingestor = Ingest(
+    openai_api_key=openai_api_key,
+    chunk=256,
+    overlap=128,
+    czech_store=czech_store,
+    english_store=english_store,
+)
+
+load_dotenv()
+
+prompt_template = """You are an electrical engineer focused on lighting and chandeliers. Provide a helpful answer to the user question.
+If you don't know the answer, just say that you don't know, don't try to make up an answer.
+
+Context: {context}
+Question: {question}
+
+Only return the helpful answer below and nothing else.
+Helpful answer:
+"""
+
+prompt = PromptTemplate(
+    template=prompt_template, input_variables=["context", "question"]
+)
+
+print("\n Prompt ready... \n\n")
+
+
+@app.get("/", response_class=HTMLResponse)
+def read_item(request: Request):
+    return templates.TemplateResponse("index.html", {"request": request})
+
+
+@app.post("/ingest_data")
+async def ingest_data(folderPath: str = Form(...), language: str = Form(...)):
+    # Determine the correct data path and store based on the language
+    if language == "czech":
+        print("\n Czech language selected....\n\n")
+        ingestor.data_czech = folderPath
+        ingestor.ingest_czech()
+        message = "Czech data ingestion complete."
+    else:
+        print("\n English language selected....\n\n")
+        ingestor.data_english = folderPath
+        ingestor.ingest_english()
+        message = "English data ingestion complete."
+
+    return {"message": message}
+
+
+@app.post("/get_response")
+async def get_response(query: str = Form(...), language: str = Form(...)):
+    print(language)
+    if language == "czech":
+        print("\n Czech language selected....\n\n")
+        embedding_model = "Seznam/simcse-dist-mpnet-paracrawl-cs-en"
+        persist_directory = czech_store
+        model_name = embedding_model
+        model_kwargs = {"device": "cpu"}
+        encode_kwargs = {"normalize_embeddings": False}
+        embedding = HuggingFaceEmbeddings(
+            model_name=model_name,
+            model_kwargs=model_kwargs,
+            encode_kwargs=encode_kwargs,
+        )
+    else:
+        print("\n English language selected....\n\n")
+        embedding_model = "text-embedding-3-large"  # Default to English
+        persist_directory = english_store
+        embedding = OpenAIEmbeddings(
+            openai_api_key=openai_api_key,
+            model=embedding_model,
+        )
+
+    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
+    retriever = vectordb.as_retriever(search_kwargs={"k": 10})
+
+    chain_type_kwargs = {"prompt": prompt}
+    qa_chain = RetrievalQA.from_chain_type(
+        llm=OpenAI(openai_api_key=openai_api_key),
+        chain_type="stuff",
+        retriever=retriever,
+        return_source_documents=True,
+        chain_type_kwargs=chain_type_kwargs,
+        verbose=True,
+    )
+    response = qa_chain(query)
+
+    for i in response["source_documents"]:
+        print(f"\n{i}\n\n")
+
+    print(response)
+
+    answer = response["result"]
+    source_document = response["source_documents"][0].page_content
+    doc = response["source_documents"][0].metadata["source"]
+    response_data = jsonable_encoder(
+        json.dumps({"answer": answer, "source_document": source_document, "doc": doc})
+    )
+
+    res = Response(response_data)
+    return res
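Note: a minimal client sketch against the two endpoints above, assuming the server is started via app.py on port 7860 and that "data/english" is a placeholder folder path; the form fields mirror the Form(...) parameters, and /get_response returns a JSON body with answer, source_document and doc keys. (fast_app.py also mounts a static/ directory, which is not part of this commit and must exist for the app to start.)

import requests

BASE = "http://localhost:7860"  # port used by app.py

# Trigger ingestion for one language (the folder path is a placeholder).
r = requests.post(f"{BASE}/ingest_data",
                  data={"folderPath": "data/english", "language": "english"})
print(r.json()["message"])

# Ask a question against the English store.
r = requests.post(f"{BASE}/get_response",
                  data={"query": "What cable can you use to hang a pendant light on?",
                        "language": "english"})
payload = r.json()  # the endpoint returns the JSON-encoded dict as the response body
print(payload["answer"])
print(payload["doc"])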
fast_app_cz(obsolete).py
ADDED
@@ -0,0 +1,110 @@
+from dotenv import load_dotenv
+import os
+import json
+from fastapi import FastAPI, Request, Form, Response
+from fastapi.responses import HTMLResponse
+from fastapi.templating import Jinja2Templates
+from fastapi.staticfiles import StaticFiles
+from fastapi.encoders import jsonable_encoder
+from langchain.llms import CTransformers
+
+from langchain.vectorstores import Chroma
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from langchain.chains import RetrievalQA
+from langchain.document_loaders import TextLoader, PyPDFLoader, DirectoryLoader
+from langchain.llms import OpenAI
+from langchain import PromptTemplate
+from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
+
+app = FastAPI()
+load_dotenv()
+openai_api_key = os.environ.get("OPENAI_API_KEY")
+templates = Jinja2Templates(directory="templates")
+app.mount("/static", StaticFiles(directory="static"), name="static")
+# embedding_model = "Seznam/simcse-dist-mpnet-czeng-cs-en"
+embedding_model = "Seznam/simcse-dist-mpnet-paracrawl-cs-en"
+persist_directory = "stores/seznampara_ul_512"
+
+llm = OpenAI(openai_api_key=openai_api_key)
+# llm = "model\dolphin-2.6-mistral-7b.Q4_K_S.gguf"
+# llm = "neural-chat-7b-v3-1.Q4_K_M.gguf"
+
+
+"""
+### - Local LLM settings - ###
+
+config = {
+    "max_new_tokens": 1024,
+    "repetition_penalty": 1.1,
+    "temperature": 0.1,
+    "top_k": 50,
+    "top_p": 0.9,
+    "stream": True,
+    "threads": int(os.cpu_count() / 2),
+}
+
+llm = CTransformers(
+    model=llm, model_type="mistral", lib="avx2", **config  # for CPU use
+)
+
+### - Local LLM settings end - ###
+"""
+
+prompt_template = """Use the following pieces of information to answer the user's question.
+If you don't know the answer, just say that you don't know, don't try to make up an answer.
+
+Context: {context}
+Question: {question}
+
+Only return the helpful answer below and nothing else.
+Helpful answer:
+"""
+
+prompt = PromptTemplate(
+    template=prompt_template, input_variables=["context", "question"]
+)
+
+print("\n Prompt ready... \n\n")
+
+
+model_name = embedding_model
+model_kwargs = {"device": "cpu"}
+encode_kwargs = {"normalize_embeddings": False}
+embedding = HuggingFaceEmbeddings(
+    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
+)
+vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
+retriever = vectordb.as_retriever(search_kwargs={"k": 3})
+
+print("\n Retrieval Ready....\n\n")
+
+
+@app.get("/", response_class=HTMLResponse)
+def read_item(request: Request):
+    return templates.TemplateResponse("index.html", {"request": request})
+
+
+@app.post("/get_response")
+async def get_response(query: str = Form(...)):
+
+    chain_type_kwargs = {"prompt": prompt}
+    qa_chain = RetrievalQA.from_chain_type(
+        llm=llm,
+        chain_type="stuff",
+        retriever=retriever,
+        return_source_documents=True,
+        chain_type_kwargs=chain_type_kwargs,
+        verbose=True,
+    )
+    response = qa_chain(query)
+    print(response)
+    answer = response["result"]
+    source_document = response["source_documents"][0].page_content
+    doc = response["source_documents"][0].metadata["source"]
+    response_data = jsonable_encoder(
+        json.dumps({"answer": answer, "source_document": source_document, "doc": doc})
+    )
+
+    res = Response(response_data)
+    return res
ingest(obsolete).py
ADDED
@@ -0,0 +1,59 @@
+from langchain.vectorstores import Chroma
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from langchain.document_loaders import (
+    PyPDFLoader,
+    DirectoryLoader,
+    UnstructuredFileLoader,
+)
+from langchain.document_loaders.csv_loader import CSVLoader
+from langchain.embeddings import (
+    OpenAIEmbeddings,
+    HuggingFaceBgeEmbeddings,
+    HuggingFaceEmbeddings,
+    HuggingFaceInstructEmbeddings,
+)
+
+
+persist_directory = "stores/test_512"
+data = "data\czech"
+chunk = 512
+overlap = 128
+# embedding_model = "Seznam/simcse-dist-mpnet-czeng-cs-en"
+embedding_model = "Seznam/simcse-dist-mpnet-paracrawl-cs-en"
+
+model_name = embedding_model
+model_kwargs = {"device": "cpu"}
+encode_kwargs = {"normalize_embeddings": False}
+embedding = HuggingFaceEmbeddings(
+    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
+)
+
+"""
+loader = CSVLoader(
+    file_path="data/emails.csv",
+    encoding="utf-8",
+    csv_args={
+        "delimiter": ";",
+    },
+)
+
+"""
+
+loader = DirectoryLoader(data, show_progress=True)
+
+
+documents = loader.load()
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=chunk,
+    chunk_overlap=overlap,
+)
+texts = text_splitter.split_documents(documents)
+vectordb = Chroma.from_documents(
+    documents=texts,
+    embedding=embedding,
+    persist_directory=persist_directory,
+    collection_metadata={"hnsw:space": "cosine"},
+)
+
+print("\n Vector Store Created.......\n\n")
ingest.py
ADDED
@@ -0,0 +1,127 @@
+from langchain.vectorstores import Chroma
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from langchain.document_loaders import PyPDFLoader, DirectoryLoader
+from langchain.embeddings import (
+    OpenAIEmbeddings,
+    HuggingFaceBgeEmbeddings,
+    HuggingFaceEmbeddings,
+    HuggingFaceInstructEmbeddings,
+)
+
+
+class Ingest:
+    def __init__(
+        self,
+        openai_api_key=None,
+        chunk=512,
+        overlap=256,
+        czech_store="stores/czech_512",
+        english_store="stores/english_512",
+        data_czech="data/czech",
+        data_english="data/english",
+    ):
+        self.openai_api_key = openai_api_key
+        self.chunk = chunk
+        self.overlap = overlap
+        self.czech_store = czech_store
+        self.english_store = english_store
+        self.data_czech = data_czech
+        self.data_english = data_english
+
+    def ingest_english(self):
+
+        embedding = OpenAIEmbeddings(
+            openai_api_key=self.openai_api_key,
+            model="text-embedding-3-large",
+        )
+
+        loader = DirectoryLoader(
+            self.data_english,
+            show_progress=True,
+        )
+
+        documents = loader.load()
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=self.chunk,
+            chunk_overlap=self.overlap,
+        )
+        texts = text_splitter.split_documents(documents)
+
+        vectordb = Chroma.from_documents(
+            documents=texts,
+            embedding=embedding,
+            persist_directory=self.english_store,
+            collection_metadata={"hnsw:space": "cosine"},
+        )
+
+        print("\n English vector Store Created.......\n\n")
+
+    def ingest_czech(self):
+        embedding_model = "Seznam/simcse-dist-mpnet-paracrawl-cs-en"
+        model_kwargs = {"device": "cpu"}
+        encode_kwargs = {"normalize_embeddings": False}
+        embedding = HuggingFaceEmbeddings(
+            model_name=embedding_model,
+            model_kwargs=model_kwargs,
+            encode_kwargs=encode_kwargs,
+        )
+
+        loader = DirectoryLoader(
+            self.data_czech,
+            show_progress=True,
+        )
+
+        documents = loader.load()
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=self.chunk,
+            chunk_overlap=self.overlap,
+        )
+
+        texts = text_splitter.split_documents(documents)
+        vectordb = Chroma.from_documents(
+            documents=texts,
+            embedding=embedding,
+            persist_directory=self.czech_store,
+            collection_metadata={"hnsw:space": "cosine"},
+        )
+
+        print("\n Czech vector Store Created.......\n\n")
+
+
+"""
+
+
+
+openai_api_key = "sk-REDACTED"
+persist_directory = "stores/store_512"
+data = "data/"
+chunk = 512
+overlap = 256
+
+embedding = OpenAIEmbeddings(
+    openai_api_key=openai_api_key,
+    model="text-embedding-3-large",
+    # model_kwargs={"device": "cpu"},
+)
+
+loader = DirectoryLoader(
+    data, glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader
+)
+documents = loader.load()
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=chunk,
+    chunk_overlap=overlap,
+)
+texts = text_splitter.split_documents(documents)
+
+vectordb = Chroma.from_documents(
+    documents=texts,
+    embedding=embedding,
+    persist_directory=persist_directory,
+    collection_metadata={"hnsw:space": "cosine"},
+)
+
+print("\n Vector Store Created.......\n\n")
+
+"""
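Note: a minimal sketch of driving the Ingest class directly, outside FastAPI, assuming OPENAI_API_KEY is set and the default data/czech and data/english folders exist; the constructor arguments mirror the ones used in fast_app.py.

import os
from ingest import Ingest

ingestor = Ingest(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    chunk=256,
    overlap=128,
    czech_store="stores/czech_512",
    english_store="stores/english_256",
)

# Build both Chroma stores from the default data folders.
ingestor.ingest_czech()    # Seznam HuggingFace embeddings -> stores/czech_512
ingestor.ingest_english()  # OpenAI text-embedding-3-large -> stores/english_256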
requirements.txt
ADDED
@@ -0,0 +1,15 @@
+langchain
+fastapi
+uvicorn
+python-multipart
+ctransformers
+qdrant-client
+torch
+sentence_transformers
+chromadb
+pytesseract
+fitz
+libpff-python
+openai
+tiktoken
+frontend
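Note: support/images-text.py does "import fitz", which is provided by PyMuPDF; the PyPI packages named "fitz" and "frontend" are unrelated to it, so this requirements list likely needs "pymupdf" instead (an assumption about the environment, not something the commit states). A quick sanity check:

# Verify that the installed "fitz" module is actually PyMuPDF.
import fitz

print(fitz.__doc__)  # PyMuPDF prints a version banner here; the unrelated PyPI "fitz" does not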
support/images-text.py
ADDED
@@ -0,0 +1,60 @@
+import fitz  # PyMuPDF
+from PIL import Image
+import pytesseract
+import io
+import os
+
+def extract_images_from_pdf(pdf_path, output_folder, dpi=300):
+    doc = fitz.open(pdf_path)
+    images = []
+    for page_num in range(len(doc)):
+        page = doc.load_page(page_num)
+        pix = page.get_pixmap(matrix=fitz.Matrix(dpi / 72, dpi / 72))
+        img_bytes = pix.tobytes("ppm")
+
+        # Convert PPM to JPEG using PIL and save
+        image = Image.open(io.BytesIO(img_bytes))
+        output_filename = f"{output_folder}/page_{page_num + 1}.jpg"
+        image.save(output_filename, "JPEG")
+
+        # Also store the image bytes for OCR processing
+        img_converted_bytes = io.BytesIO()
+        image.save(img_converted_bytes, format='JPEG')
+        images.append(img_converted_bytes.getvalue())
+
+    print(f"Images saved to {output_folder}")
+    return images
+
+def ocr_images(images):
+    text_from_images = []
+    for image_bytes in images:
+        image = Image.open(io.BytesIO(image_bytes))
+        text = pytesseract.image_to_string(image, lang='ces')
+        text_from_images.append(text)
+    return text_from_images
+
+def save_text_to_file(text, file_path):
+    with open(file_path, 'w', encoding='utf-8') as file:
+        file.write(text)
+
+# Path to your PDF file and output locations
+pdf_path = "norm.pdf"
+output_folder = "extracted_images"
+output_text_file = "ocr_results.txt"
+
+# Ensure the output directory exists
+os.makedirs(output_folder, exist_ok=True)
+
+# Extract images from the PDF and save them as JPEG
+images = extract_images_from_pdf(pdf_path, output_folder)
+
+# Transcribe text from the extracted images with Czech language
+transcribed_texts = ocr_images(images)
+
+# Combine all texts into a single string
+combined_text = "\n".join(transcribed_texts)
+
+# Save OCR results to a text file
+save_text_to_file(combined_text, output_text_file)
+
+print(f"OCR results saved to {output_text_file}")
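Note: pytesseract only wraps the Tesseract binary, so the lang='ces' call above assumes Tesseract and its Czech language data are installed on the host. A small pre-flight sketch (the commented Windows path is purely illustrative, not taken from this repo):

import pytesseract

# Point pytesseract at the binary if tesseract is not on PATH (illustrative path):
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

print(pytesseract.get_tesseract_version())   # fails loudly if the binary is missing
print(pytesseract.get_languages(config=""))  # should include "ces" for Czech OCR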
support/process_archives.py
ADDED
@@ -0,0 +1,87 @@
+import pypff
+import re
+from datetime import datetime
+import pandas as pd
+
+print(f"[INFO] pypff version: {pypff.get_version()}")
+
+DATA_PATH = "data\Archiv_2023_2.pst"
+
+# Patterns to match and remove
+patterns = [
+    r"\bM\s\+\d{3}\d{3}\d{3}\d{3}\b",  # Phone numbers
+    r"\bP\s\+\d{3}\d{3}\d{3}\d{3}\b",  # Phone numbers
+    r"\S+@\S+",  # Email addresses
+    r"http[s]?://\S+",  # URLs
+    r"preciosalighting\.com\s*<",
+    r"Facebook\s*<",  # Social Media links
+    r"Instagram\s*<",
+    r"Youtube\s*<",
+    r"Pinterest\s*<",
+    r"Linkedin\s*<",
+    r"_+",  # Line of underscores
+    # Czech legal disclaimer
+    r"Tento e-mail je určen pouze.*od odesílatele k adresátovi\.",
+    # English legal disclaimer
+    r"This e-mail transmission is intended solely.*from the sender to the recipient\.",
+    r"From:.*\n?",
+    r"Sent:.*\n?",
+    r"To:.*\n?",
+    r"Cc:.*\n?",
+    r"Subject:.*\n?",
+    r";",  # Semicolons
+    r"[^\w\s,.]",
+]
+
+
+def extract_emails(pst_file):
+    opened_pst = pypff.open(pst_file)
+    root = opened_pst.get_root_folder()
+
+    emails = []
+
+    def process_folder(folder):
+        for folder in folder.sub_folders:
+            process_folder(folder)
+        for message in folder.sub_messages:
+            emails.append(
+                {
+                    "subject": message.subject,
+                    "body": message.plain_text_body,
+                    "sender": message.sender_name,
+                    "date": message.delivery_time,
+                }
+            )
+
+    process_folder(root)
+    return emails
+
+
+def format_item(item, patterns):
+    date = item["date"].strftime("%Y-%m-%d")
+    body = item["body"].decode("utf-8")
+    for pattern in patterns:
+        body = re.sub(pattern, "", body)
+    body = re.sub("\s+", " ", body).strip()
+
+    return {
+        "subject": item["subject"],
+        "body": body,
+        "sender": item["sender"],
+        "date": date,
+    }
+
+
+def main():
+    dataset_list = []
+    emails = extract_emails(DATA_PATH)
+    for email in emails:
+        dataset_list.append(format_item(email, patterns))
+
+    df = pd.DataFrame(dataset_list)
+    df.head()
+    df.to_csv("data\emails.csv", index=True, header=True, sep=";")
+
+
+if __name__ == "__main__":
+    main()
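Note: the CSV written here (semicolon-separated, UTF-8) matches the commented-out CSVLoader block in ingest(obsolete).py; a minimal sketch of reading it back into LangChain documents, assuming data/emails.csv was produced by main() above:

from langchain.document_loaders.csv_loader import CSVLoader

loader = CSVLoader(
    file_path="data/emails.csv",
    encoding="utf-8",
    csv_args={"delimiter": ";"},
)
documents = loader.load()
print(f"Loaded {len(documents)} email documents")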
templates/index.html
ADDED
@@ -0,0 +1,316 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>RAG question answer bot</title>
+    <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@400;600&display=swap" rel="stylesheet">
+    <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet">
+    <style>
+        body {
+            background-color: black;
+            font-family: 'Poppins', sans-serif;
+            color: white;
+        }
+        .chat-container {
+            max-width: 800px;
+            margin: 50px auto;
+            margin-top: 10%;
+            padding: 20px;
+            background-color: #333;
+            border-radius: 10px;
+        }
+        .chat-heading {
+            text-align: center;
+            font-size: 2.5em;
+            font-weight: 600;
+            margin-bottom: 30px;
+            color: #ffd700; /* Golden color for the heading */
+        }
+        .chat-input {
+            margin-top: 20px; /* Added margin */
+            margin-bottom: 20px;
+            height: 100px; /* Increased height */
+        }
+        .chat-button {
+            background-color: green;
+            color: white;
+            padding: 10px 20px;
+            font-size: 1.2em;
+        }
+        .chat-response {
+            background-color: #444;
+            padding: 15px;
+            border-radius: 5px;
+            min-height: 100px; /* Minimum height for the response box */
+            margin-top: 20px;
+        }
+        .accordion {
+
+            background-color: #444;
+            border-radius: 5px;
+        }
+        .accordion-button {
+            color: white;
+            background-color: #555;
+        }
+        .accordion-body {
+            color: white; /* Improved visibility of text */
+        }
+        pre {
+            white-space:pre-wrap;
+        }
+        #exampleQueries {
+            margin-top: 10px;
+            text-align: center;
+        }
+
+        #exampleQueries button {
+            margin-right: 10px;
+            margin-bottom: 5px;
+        }
+
+
+        /* The switch - the box around the slider */
+
+        .language-toggle {
+            text-align: right;
+            margin: 16px;
+            position: relative; /* Added position relative to align the label */
+        }
+
+        .switch {
+            position: relative;
+            display: inline-block;
+            width: 60px;
+            height: 34px;
+        }
+
+        /* Hide default HTML checkbox */
+        .switch input {
+            opacity: 0;
+            width: 0;
+            height: 0;
+        }
+
+        /* The slider */
+        .slider {
+            position: absolute;
+            cursor: pointer;
+            top: 0;
+            left: 0;
+            right: 0;
+            bottom: 0;
+            background-color: #ccc;
+            -webkit-transition: .4s;
+            transition: .4s;
+        }
+
+        .slider:before {
+            position: absolute;
+            content: "";
+            height: 26px;
+            width: 26px;
+            left: 4px;
+            bottom: 4px;
+            background-color: white;
+            -webkit-transition: .4s;
+            transition: .4s;
+        }
+
+        input:checked + .slider {
+            background-color: #2196F3;
+        }
+
+        input:focus + .slider {
+            box-shadow: 0 0 1px #2196F3;
+        }
+
+        input:checked + .slider:before {
+            -webkit-transform: translateX(26px);
+            -ms-transform: translateX(26px);
+            transform: translateX(26px);
+        }
+
+        /* Rounded sliders */
+        .slider.round {
+            border-radius: 34px;
+        }
+
+        .slider.round:before {
+            border-radius: 50%;
+        }
+
+    </style>
+
+</head>
+<body>
+    <div class="container chat-container">
+        <h1 class="chat-heading">RAG Search</h1>
+
+
+        <!-- Tab Navigation -->
+        <ul class="nav nav-tabs" id="myTab" role="tablist">
+            <li class="nav-item" role="presentation">
+                <button class="nav-link active" id="retriever-tab" data-bs-toggle="tab" data-bs-target="#retriever" type="button" role="tab" aria-controls="retriever" aria-selected="true">Retriever</button>
+            </li>
+            <li class="nav-item" role="presentation">
+                <button class="nav-link" id="ingest-tab" data-bs-toggle="tab" data-bs-target="#ingest" type="button" role="tab" aria-controls="ingest" aria-selected="false">Ingest</button>
+            </li>
+        </ul>
+
+
+        <!-- Language Toggle Switch -->
+        <!-- Tab Content -->
+        <div class="tab-content" id="myTabContent">
+            <!-- Retriever Tab Pane -->
+            <div class="tab-pane fade show active" id="retriever" role="tabpanel" aria-labelledby="retriever-tab">
+                <!-- Language Toggle Switch -->
+                <div class="language-toggle">
+                    <label class="switch">
+                        <input type="checkbox" class="languageCheckbox"> <!-- Changed to class -->
+                        <span class="slider round"></span>
+                    </label>
+                    <span class="languageLabel">English</span> <!-- Changed to class -->
+                </div>
+
+                <div class="accordion" id="appDescriptionAccordion">
+                    <div class="accordion-item">
+                        <h2 class="accordion-header" id="descriptionHeading">
+                            <button class="accordion-button collapsed" type="button" data-bs-toggle="collapse" data-bs-target="#collapseDescription" aria-expanded="true" aria-controls="collapseDescription">
+                                About This App
+                            </button>
+                        </h2>
+
+                        <div id="collapseDescription" class="accordion-collapse collapse" aria-labelledby="descriptionHeading" data-bs-parent="#appDescriptionAccordion">
+                            <div class="accordion-body text-dark">
+                                This is a RAG implementation using an Open Source stack. Intel's Neural Chat has been used to build this app along with BGE Embeddings as an embedding model, Chroma DB as a vector store, and Langchain & CTransformers as orchestration frameworks.
+                            </div>
+                        </div>
+                    </div>
+                </div>
+                <!-- Example Queries Section -->
+                <div id="exampleQueries" class="mb-3">
+                    <h2 class="h5">Try Example Queries:</h2>
+                    <button class="btn btn-sm btn-secondary example-query">What cable can you use to hang a pendant light on?</button>
+                    <button class="btn btn-sm btn-secondary example-query">What is the minimal gauge of live wires?</button>
+                    <button class="btn btn-sm btn-secondary example-query">What flammability requirements do plastic enclosures have to meet?</button>
+                </div>
+
+                <div class="row">
+                    <div class="col">
+                        <textarea id="userInput" class="form-control chat-input" placeholder="Type your query here..."></textarea>
+                        <button id="submitBtn" class="btn chat-button">Submit</button>
+                        <div id="response" class="chat-response"></div>
+                    </div>
+                </div>
+            </div>
+
+
+
+            <!-- Ingest Tab Pane -->
+            <div class="tab-pane fade" id="ingest" role="tabpanel" aria-labelledby="ingest-tab">
+                <!-- Language Toggle Switch -->
+                <div class="language-toggle">
+                    <label class="switch">
+                        <input type="checkbox" class="languageCheckbox"> <!-- Changed to class -->
+                        <span class="slider round"></span>
+                    </label>
+                    <span class="languageLabel">English</span> <!-- Changed to class -->
+                </div>
+
+                <!-- Ingest Content -->
+                <h2>Document Ingestion</h2>
+                <input type="text" id="folderPath" class="form-control my-2" placeholder="Enter the path to your data folder" />
+                <button id="ingestBtn" class="btn chat-button">Ingest</button>
+                <div id="ingestResponse" class="chat-response"></div>
+            </div>
+        </div>
+    </div>
+
+
+
+
+    <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/js/bootstrap.bundle.min.js"></script>
+    <script>
+        // Function to update language labels and synchronize toggle states
+        function updateLanguageTogglesAndLabels(checked) {
+            document.querySelectorAll('.languageCheckbox').forEach(function(checkbox) {
+                checkbox.checked = checked; // Synchronize toggle state
+            });
+            document.querySelectorAll('.languageLabel').forEach(function(label) {
+                label.textContent = checked ? "Czech" : "English"; // Update all labels
+            });
+        }
+
+        // Attach event listeners to all language toggle switches
+        document.querySelectorAll('.languageCheckbox').forEach(function(checkbox) {
+            checkbox.addEventListener('change', function() {
+                updateLanguageTogglesAndLabels(this.checked);
+            });
+        });
+
+        document.getElementById('submitBtn').addEventListener('click', async function() {
+            var userInput = document.getElementById('userInput').value;
+
+            var languageCheckbox = document.querySelector('.languageCheckbox').checked;
+            var language = languageCheckbox ? "czech" : "english";
+            document.getElementById('response').innerHTML = '<p>Processing...</p>';
+            const formData = new FormData();
+            formData.append('query', userInput);
+            formData.append('language', language);
+
+            try {
+                const response = await fetch('/get_response', {
+                    method: 'POST',
+                    body: formData
+                });
+
+                if (!response.ok) {
+                    throw new Error('Network response was not ok');
+                }
+
+                const data = await response.json();
+                document.getElementById('response').innerHTML = `<p>${data.answer}</p><br><pre><b>Context: </b> ${data.source_document}</pre><br><pre><b>Source Document: </b> ${data.doc}</pre>`;
+            } catch (error) {
+                console.error('Error:', error);
+                document.getElementById('response').innerHTML = '<p>Error processing your request</p>';
+            }
+        });
+
+        // Add event listeners to example queries
+        document.querySelectorAll('.example-query').forEach(item => {
+            item.addEventListener('click', function() {
+                document.getElementById('userInput').value = this.textContent; // Insert clicked query into textarea
+            });
+        });
+
+        // Ingest data
+        document.getElementById('ingestBtn').addEventListener('click', async function() {
+            var folderPath = document.getElementById('folderPath').value;
+            var languageCheckbox = document.querySelector('.languageCheckbox').checked;
+            var language = languageCheckbox ? "czech" : "english"; // Determine the language based on the checkbox
+            document.getElementById('ingestResponse').innerHTML = '<p>Starting ingestion...</p>';
+            try {
+                const response = await fetch('/ingest_data', {
+                    method: 'POST',
+                    headers: {
+                        'Content-Type': 'application/x-www-form-urlencoded',
+                    },
+                    body: `folderPath=${folderPath}&language=${language}` // Include the language in the request body
+                });
+
+                if (!response.ok) {
+                    throw new Error('Network response was not ok');
+                }
+
+                const data = await response.json();
+                document.getElementById('ingestResponse').innerHTML = `<p>${data.message}</p>`;
+            } catch (error) {
+                console.error('Error:', error);
+                document.getElementById('ingestResponse').innerHTML = '<p>Error during ingestion process</p>';
+            }
+        });
+    </script>
+</body>
+</html>