Teapack1 committed
Commit 99afe26 · 0 Parent(s)
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,13 @@
+ /libpff-20231205
+ /libpff-alpha-20231205.tar.gz
+ /venv
+ /.chainlit
+ /model
+ /__pycache__
+ .files
+ *.gguf
+ *.pst
+ /data
+ *.jpg
+ /*.env
+ /*.log
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: RAG Retrieve Ingest Cz Eng
+ emoji: 😻
+ colorFrom: purple
+ colorTo: yellow
+ sdk: gradio
+ sdk_version: 4.19.1
+ app_file: app.py
+ pinned: false
+ license: mit
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,4 @@
+ import subprocess
+
+ print("Starting the FastAPI server...")
+ subprocess.run("uvicorn fast_app:app --host 0.0.0.0 --port 7860", shell=True)
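app.py simply shells out to uvicorn. An equivalent, hypothetical launcher could call uvicorn's Python API directly instead of a subprocess; a minimal sketch under the same host/port assumptions:

```python
# launch_fast_app.py - hypothetical alternative to app.py (not part of this commit)
import uvicorn

if __name__ == "__main__":
    # Serve the FastAPI instance defined in fast_app.py on the address the
    # Space expects (0.0.0.0:7860), without spawning a shell.
    uvicorn.run("fast_app:app", host="0.0.0.0", port=7860)
```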
fast_app.py ADDED
@@ -0,0 +1,146 @@
+ from dotenv import load_dotenv
+ import os
+ import json
+ from fastapi import FastAPI, Request, Form, Response
+ from fastapi.responses import HTMLResponse
+ from fastapi.templating import Jinja2Templates
+ from fastapi.staticfiles import StaticFiles
+ from fastapi.encoders import jsonable_encoder
+
+ from langchain.vectorstores import Chroma
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ from langchain.chains import RetrievalQA
+ from langchain.document_loaders import (
+     TextLoader,
+     PyPDFLoader,
+     DirectoryLoader,
+     UnstructuredFileLoader,
+ )
+ from langchain.document_loaders.csv_loader import CSVLoader
+ from langchain.llms import OpenAI
+ from langchain import PromptTemplate
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
+
+ from ingest import Ingest
+
+ # setx OPENAI_API_KEY "your_openai_api_key_here"
+
+ # Access the Hugging Face API token from an environment variable
+ # huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
+ # if huggingface_token is None:
+ #     raise ValueError("Hugging Face token is not set in environment variables.")
+
+ # Load variables from a local .env file before reading them.
+ load_dotenv()
+
+ openai_api_key = os.getenv("OPENAI_API_KEY")
+ if openai_api_key is None:
+     raise ValueError("OPENAI_API_KEY is not set in environment variables.")
+
+
+ app = FastAPI()
+ templates = Jinja2Templates(directory="templates")
+ app.mount("/static", StaticFiles(directory="static"), name="static")
+
+ czech_store = "stores/czech_512"
+ english_store = "stores/english_256"
+
+ ingestor = Ingest(
+     openai_api_key=openai_api_key,
+     chunk=256,
+     overlap=128,
+     czech_store=czech_store,
+     english_store=english_store,
+ )
+
+ prompt_template = """You are an electrical engineer focused on lighting and chandeliers. Provide a helpful answer to the user's question.
+ If you don't know the answer, just say that you don't know; don't try to make up an answer.
+
+ Context: {context}
+ Question: {question}
+
+ Only return the helpful answer below and nothing else.
+ Helpful answer:
+ """
+
+ prompt = PromptTemplate(
+     template=prompt_template, input_variables=["context", "question"]
+ )
+
+ print("\n Prompt ready... \n\n")
+
+
+ @app.get("/", response_class=HTMLResponse)
+ def read_item(request: Request):
+     return templates.TemplateResponse("index.html", {"request": request})
+
+
+ @app.post("/ingest_data")
+ async def ingest_data(folderPath: str = Form(...), language: str = Form(...)):
+     # Determine the correct data path and store based on the language
+     if language == "czech":
+         print("\n Czech language selected....\n\n")
+         ingestor.data_czech = folderPath
+         ingestor.ingest_czech()
+         message = "Czech data ingestion complete."
+     else:
+         print("\n English language selected....\n\n")
+         ingestor.data_english = folderPath
+         ingestor.ingest_english()
+         message = "English data ingestion complete."
+
+     return {"message": message}
+
+
+ @app.post("/get_response")
+ async def get_response(query: str = Form(...), language: str = Form(...)):
+     print(language)
+     if language == "czech":
+         print("\n Czech language selected....\n\n")
+         embedding_model = "Seznam/simcse-dist-mpnet-paracrawl-cs-en"
+         persist_directory = czech_store
+         model_name = embedding_model
+         model_kwargs = {"device": "cpu"}
+         encode_kwargs = {"normalize_embeddings": False}
+         embedding = HuggingFaceEmbeddings(
+             model_name=model_name,
+             model_kwargs=model_kwargs,
+             encode_kwargs=encode_kwargs,
+         )
+     else:
+         print("\n English language selected....\n\n")
+         embedding_model = "text-embedding-3-large"  # Default to English
+         persist_directory = english_store
+         embedding = OpenAIEmbeddings(
+             openai_api_key=openai_api_key,
+             model=embedding_model,
+         )
+
+     vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
+     retriever = vectordb.as_retriever(search_kwargs={"k": 10})
+
+     chain_type_kwargs = {"prompt": prompt}
+     qa_chain = RetrievalQA.from_chain_type(
+         llm=OpenAI(openai_api_key=openai_api_key),
+         chain_type="stuff",
+         retriever=retriever,
+         return_source_documents=True,
+         chain_type_kwargs=chain_type_kwargs,
+         verbose=True,
+     )
+     response = qa_chain(query)
+
+     for i in response["source_documents"]:
+         print(f"\n{i}\n\n")
+
+     print(response)
+
+     answer = response["result"]
+     source_document = response["source_documents"][0].page_content
+     doc = response["source_documents"][0].metadata["source"]
+     response_data = jsonable_encoder(
+         json.dumps({"answer": answer, "source_document": source_document, "doc": doc})
+     )
+
+     res = Response(response_data)
+     return res
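Both endpoints accept ordinary form fields, so the API can be exercised without the HTML front end. A hedged sketch using requests; the base URL and the folder path are assumptions for a local run of app.py:

```python
import requests

BASE_URL = "http://localhost:7860"  # assumed local run of app.py

# Trigger ingestion of an (assumed) folder of English documents.
r = requests.post(
    f"{BASE_URL}/ingest_data",
    data={"folderPath": "data/english", "language": "english"},
)
print(r.json())  # e.g. {"message": "English data ingestion complete."}

# Ask a question against the English store; the endpoint returns a JSON string body.
r = requests.post(
    f"{BASE_URL}/get_response",
    data={"query": "What cable can you use to hang a pendant light on?", "language": "english"},
)
print(r.json()["answer"])
```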
fast_app_cz(obsolete).py ADDED
@@ -0,0 +1,110 @@
+ from dotenv import load_dotenv
+ import os
+ import json
+ from fastapi import FastAPI, Request, Form, Response
+ from fastapi.responses import HTMLResponse
+ from fastapi.templating import Jinja2Templates
+ from fastapi.staticfiles import StaticFiles
+ from fastapi.encoders import jsonable_encoder
+ from langchain.llms import CTransformers
+
+ from langchain.vectorstores import Chroma
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ from langchain.chains import RetrievalQA
+ from langchain.document_loaders import TextLoader, PyPDFLoader, DirectoryLoader
+ from langchain.llms import OpenAI
+ from langchain import PromptTemplate
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
+
+ app = FastAPI()
+ load_dotenv()
+ openai_api_key = os.environ.get("OPENAI_API_KEY")
+ templates = Jinja2Templates(directory="templates")
+ app.mount("/static", StaticFiles(directory="static"), name="static")
+ # embedding_model = "Seznam/simcse-dist-mpnet-czeng-cs-en"
+ embedding_model = "Seznam/simcse-dist-mpnet-paracrawl-cs-en"
+ persist_directory = "stores/seznampara_ul_512"
+
+ llm = OpenAI(openai_api_key=openai_api_key)
+ # llm = "model\dolphin-2.6-mistral-7b.Q4_K_S.gguf"
+ # llm = "neural-chat-7b-v3-1.Q4_K_M.gguf"
+
+
+ """
+ ### - Local LLM settings - ###
+
+ config = {
+     "max_new_tokens": 1024,
+     "repetition_penalty": 1.1,
+     "temperature": 0.1,
+     "top_k": 50,
+     "top_p": 0.9,
+     "stream": True,
+     "threads": int(os.cpu_count() / 2),
+ }
+
+ llm = CTransformers(
+     model=llm, model_type="mistral", lib="avx2", **config  # for CPU use
+ )
+
+ ### - Local LLM settings end - ###
+ """
+
+ prompt_template = """Use the following pieces of information to answer the user's question.
+ If you don't know the answer, just say that you don't know, don't try to make up an answer.
+
+ Context: {context}
+ Question: {question}
+
+ Only return the helpful answer below and nothing else.
+ Helpful answer:
+ """
+
+ prompt = PromptTemplate(
+     template=prompt_template, input_variables=["context", "question"]
+ )
+
+ print("\n Prompt ready... \n\n")
+
+
+ model_name = embedding_model
+ model_kwargs = {"device": "cpu"}
+ encode_kwargs = {"normalize_embeddings": False}
+ embedding = HuggingFaceEmbeddings(
+     model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
+ )
+ vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
+ retriever = vectordb.as_retriever(search_kwargs={"k": 3})
+
+ print("\n Retrieval Ready....\n\n")
+
+
+ @app.get("/", response_class=HTMLResponse)
+ def read_item(request: Request):
+     return templates.TemplateResponse("index.html", {"request": request})
+
+
+ @app.post("/get_response")
+ async def get_response(query: str = Form(...)):
+
+     chain_type_kwargs = {"prompt": prompt}
+     qa_chain = RetrievalQA.from_chain_type(
+         llm=llm,
+         chain_type="stuff",
+         retriever=retriever,
+         return_source_documents=True,
+         chain_type_kwargs=chain_type_kwargs,
+         verbose=True,
+     )
+     response = qa_chain(query)
+     print(response)
+     answer = response["result"]
+     source_document = response["source_documents"][0].page_content
+     doc = response["source_documents"][0].metadata["source"]
+     response_data = jsonable_encoder(
+         json.dumps({"answer": answer, "source_document": source_document, "doc": doc})
+     )
+
+     res = Response(response_data)
+     return res
ingest(obsolete).py ADDED
@@ -0,0 +1,59 @@
+ from langchain.vectorstores import Chroma
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ from langchain.document_loaders import (
+     PyPDFLoader,
+     DirectoryLoader,
+     UnstructuredFileLoader,
+ )
+ from langchain.document_loaders.csv_loader import CSVLoader
+ from langchain.embeddings import (
+     OpenAIEmbeddings,
+     HuggingFaceBgeEmbeddings,
+     HuggingFaceEmbeddings,
+     HuggingFaceInstructEmbeddings,
+ )
+
+
+ persist_directory = "stores/test_512"
+ data = "data/czech"  # forward slash keeps the path portable across OSes
+ chunk = 512
+ overlap = 128
+ # embedding_model = "Seznam/simcse-dist-mpnet-czeng-cs-en"
+ embedding_model = "Seznam/simcse-dist-mpnet-paracrawl-cs-en"
+
+ model_name = embedding_model
+ model_kwargs = {"device": "cpu"}
+ encode_kwargs = {"normalize_embeddings": False}
+ embedding = HuggingFaceEmbeddings(
+     model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
+ )
+
+ """
+ loader = CSVLoader(
+     file_path="data/emails.csv",
+     encoding="utf-8",
+     csv_args={
+         "delimiter": ";",
+     },
+ )
+
+ """
+
+ loader = DirectoryLoader(data, show_progress=True)
+
+
+ documents = loader.load()
+ text_splitter = RecursiveCharacterTextSplitter(
+     chunk_size=chunk,
+     chunk_overlap=overlap,
+ )
+ texts = text_splitter.split_documents(documents)
+ vectordb = Chroma.from_documents(
+     documents=texts,
+     embedding=embedding,
+     persist_directory=persist_directory,
+     collection_metadata={"hnsw:space": "cosine"},
+ )
+
+ print("\n Vector Store Created.......\n\n")
ingest.py ADDED
@@ -0,0 +1,127 @@
+ from langchain.vectorstores import Chroma
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ from langchain.document_loaders import PyPDFLoader, DirectoryLoader
+ from langchain.embeddings import (
+     OpenAIEmbeddings,
+     HuggingFaceBgeEmbeddings,
+     HuggingFaceEmbeddings,
+     HuggingFaceInstructEmbeddings,
+ )
+
+
+ class Ingest:
+     def __init__(
+         self,
+         openai_api_key=None,
+         chunk=512,
+         overlap=256,
+         czech_store="stores/czech_512",
+         english_store="stores/english_512",
+         data_czech="data/czech",
+         data_english="data/english",
+     ):
+         self.openai_api_key = openai_api_key
+         self.chunk = chunk
+         self.overlap = overlap
+         self.czech_store = czech_store
+         self.english_store = english_store
+         self.data_czech = data_czech
+         self.data_english = data_english
+
+     def ingest_english(self):
+
+         embedding = OpenAIEmbeddings(
+             openai_api_key=self.openai_api_key,
+             model="text-embedding-3-large",
+         )
+
+         loader = DirectoryLoader(
+             self.data_english,
+             show_progress=True,
+         )
+
+         documents = loader.load()
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=self.chunk,
+             chunk_overlap=self.overlap,
+         )
+         texts = text_splitter.split_documents(documents)
+
+         vectordb = Chroma.from_documents(
+             documents=texts,
+             embedding=embedding,
+             persist_directory=self.english_store,
+             collection_metadata={"hnsw:space": "cosine"},
+         )
+
+         print("\n English vector Store Created.......\n\n")
+
+     def ingest_czech(self):
+         embedding_model = "Seznam/simcse-dist-mpnet-paracrawl-cs-en"
+         model_kwargs = {"device": "cpu"}
+         encode_kwargs = {"normalize_embeddings": False}
+         embedding = HuggingFaceEmbeddings(
+             model_name=embedding_model,
+             model_kwargs=model_kwargs,
+             encode_kwargs=encode_kwargs,
+         )
+
+         loader = DirectoryLoader(
+             self.data_czech,
+             show_progress=True,
+         )
+
+         documents = loader.load()
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=self.chunk,
+             chunk_overlap=self.overlap,
+         )
+
+         texts = text_splitter.split_documents(documents)
+         vectordb = Chroma.from_documents(
+             documents=texts,
+             embedding=embedding,
+             persist_directory=self.czech_store,
+             collection_metadata={"hnsw:space": "cosine"},
+         )
+
+         print("\n Czech vector Store Created.......\n\n")
+
+
+ """
+
+
+
+ openai_api_key = "YOUR_OPENAI_API_KEY"  # placeholder; never commit a real key, read it from the environment
+ persist_directory = "stores/store_512"
+ data = "data/"
+ chunk = 512
+ overlap = 256
+
+ embedding = OpenAIEmbeddings(
+     openai_api_key=openai_api_key,
+     model="text-embedding-3-large",
+     # model_kwargs={"device": "cpu"},
+ )
+
+ loader = DirectoryLoader(
+     data, glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader
+ )
+ documents = loader.load()
+ text_splitter = RecursiveCharacterTextSplitter(
+     chunk_size=chunk,
+     chunk_overlap=overlap,
+ )
+ texts = text_splitter.split_documents(documents)
+
+ vectordb = Chroma.from_documents(
+     documents=texts,
+     embedding=embedding,
+     persist_directory=persist_directory,
+     collection_metadata={"hnsw:space": "cosine"},
+ )
+
+ print("\n Vector Store Created.......\n\n")
+
+ """
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ langchain
+ fastapi
+ uvicorn
+ python-multipart
+ ctransformers
+ qdrant-client
+ torch
+ sentence_transformers
+ chromadb
+ pytesseract
+ PyMuPDF  # provides the "fitz" module imported in support/images-text.py
+ libpff-python
+ openai
+ tiktoken
+ frontend
support/images-text.py ADDED
@@ -0,0 +1,60 @@
+ import fitz  # PyMuPDF
+ from PIL import Image
+ import pytesseract
+ import io
+ import os
+
+ def extract_images_from_pdf(pdf_path, output_folder, dpi=300):
+     doc = fitz.open(pdf_path)
+     images = []
+     for page_num in range(len(doc)):
+         page = doc.load_page(page_num)
+         pix = page.get_pixmap(matrix=fitz.Matrix(dpi / 72, dpi / 72))
+         img_bytes = pix.tobytes("ppm")
+
+         # Convert PPM to JPEG using PIL and save
+         image = Image.open(io.BytesIO(img_bytes))
+         output_filename = f"{output_folder}/page_{page_num + 1}.jpg"
+         image.save(output_filename, "JPEG")
+
+         # Also store the image bytes for OCR processing
+         img_converted_bytes = io.BytesIO()
+         image.save(img_converted_bytes, format='JPEG')
+         images.append(img_converted_bytes.getvalue())
+
+     print(f"Images saved to {output_folder}")
+     return images
+
+ def ocr_images(images):
+     text_from_images = []
+     for image_bytes in images:
+         image = Image.open(io.BytesIO(image_bytes))
+         text = pytesseract.image_to_string(image, lang='ces')
+         text_from_images.append(text)
+     return text_from_images
+
+ def save_text_to_file(text, file_path):
+     with open(file_path, 'w', encoding='utf-8') as file:
+         file.write(text)
+
+ # Path to your PDF file and output locations
+ pdf_path = "norm.pdf"
+ output_folder = "extracted_images"
+ output_text_file = "ocr_results.txt"
+
+ # Ensure the output directory exists
+ os.makedirs(output_folder, exist_ok=True)
+
+ # Extract images from the PDF and save them as JPEG
+ images = extract_images_from_pdf(pdf_path, output_folder)
+
+ # Transcribe text from the extracted images with Czech language
+ transcribed_texts = ocr_images(images)
+
+ # Combine all texts into a single string
+ combined_text = "\n".join(transcribed_texts)
+
+ # Save OCR results to a text file
+ save_text_to_file(combined_text, output_text_file)
+
+ print(f"OCR results saved to {output_text_file}")
support/process_archives.py ADDED
@@ -0,0 +1,87 @@
+ import pypff
+ import re
+ from datetime import datetime
+ import pandas as pd
+
+ print(f"[INFO] pypff version: {pypff.get_version()}")
+
+ DATA_PATH = "data/Archiv_2023_2.pst"
+
+ # Patterns to match and remove
+ patterns = [
+     r"\bM\s\+\d{3}\d{3}\d{3}\d{3}\b",  # Phone numbers
+     r"\bP\s\+\d{3}\d{3}\d{3}\d{3}\b",  # Phone numbers
+     r"\S+@\S+",  # Email addresses
+     r"http[s]?://\S+",  # URLs
+     r"preciosalighting\.com\s*<",
+     r"Facebook\s*<",  # Social media links
+     r"Instagram\s*<",
+     r"Youtube\s*<",
+     r"Pinterest\s*<",
+     r"Linkedin\s*<",
+     r"_+",  # Line of underscores
+     # Czech legal disclaimer
+     r"Tento e-mail je určen pouze.*od odesílatele k adresátovi\.",
+     # English legal disclaimer
+     r"This e-mail transmission is intended solely.*from the sender to the recipient\.",
+     r"From:.*\n?",
+     r"Sent:.*\n?",
+     r"To:.*\n?",
+     r"Cc:.*\n?",
+     r"Subject:.*\n?",
+     r";",  # Semicolons
+     r"[^\w\s,.]",
+ ]
+
+
+ def extract_emails(pst_file):
+     opened_pst = pypff.open(pst_file)
+     root = opened_pst.get_root_folder()
+
+     emails = []
+
+     def process_folder(folder):
+         # Recurse with a separate loop variable so the parent folder is not shadowed.
+         for sub_folder in folder.sub_folders:
+             process_folder(sub_folder)
+         for message in folder.sub_messages:
+             emails.append(
+                 {
+                     "subject": message.subject,
+                     "body": message.plain_text_body,
+                     "sender": message.sender_name,
+                     "date": message.delivery_time,
+                 }
+             )
+
+     process_folder(root)
+     return emails
+
+
+ def format_item(item, patterns):
+     date = item["date"].strftime("%Y-%m-%d")
+     body = item["body"].decode("utf-8")
+     for pattern in patterns:
+         body = re.sub(pattern, "", body)
+     body = re.sub(r"\s+", " ", body).strip()
+
+     return {
+         "subject": item["subject"],
+         "body": body,
+         "sender": item["sender"],
+         "date": date,
+     }
+
+
+ def main():
+     dataset_list = []
+     emails = extract_emails(DATA_PATH)
+     for email in emails:
+         dataset_list.append(format_item(email, patterns))
+
+     df = pd.DataFrame(dataset_list)
+     df.head()
+     df.to_csv("data/emails.csv", index=True, header=True, sep=";")
+
+
+ if __name__ == "__main__":
+     main()
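The regex cleanup in format_item is easiest to sanity-check on a synthetic message rather than a full PST export. A minimal sketch, assuming it runs inside the same module so format_item and patterns are in scope; every value below is a made-up placeholder:

```python
from datetime import datetime

# Hypothetical message in the same shape extract_emails() produces.
sample = {
    "subject": "Offer 123",
    "body": b"Hello,\nsee http://example.com and call M +420123456789.\n_____\nRegards",
    "sender": "John Doe",
    "date": datetime(2023, 5, 4, 10, 30),
}

cleaned = format_item(sample, patterns)
print(cleaned["body"])  # URL, phone number and underscore run are stripped
print(cleaned["date"])  # "2023-05-04"
```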
templates/index.html ADDED
@@ -0,0 +1,316 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+     <meta charset="UTF-8">
+     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+     <title>RAG question answering bot</title>
+     <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@400;600&display=swap" rel="stylesheet">
+     <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet">
+     <style>
+         body {
+             background-color: black;
+             font-family: 'Poppins', sans-serif;
+             color: white;
+         }
+         .chat-container {
+             max-width: 800px;
+             margin: 50px auto;
+             margin-top: 10%;
+             padding: 20px;
+             background-color: #333;
+             border-radius: 10px;
+         }
+         .chat-heading {
+             text-align: center;
+             font-size: 2.5em;
+             font-weight: 600;
+             margin-bottom: 30px;
+             color: #ffd700; /* Golden color for the heading */
+         }
+         .chat-input {
+             margin-top: 20px; /* Added margin */
+             margin-bottom: 20px;
+             height: 100px; /* Increased height */
+         }
+         .chat-button {
+             background-color: green;
+             color: white;
+             padding: 10px 20px;
+             font-size: 1.2em;
+         }
+         .chat-response {
+             background-color: #444;
+             padding: 15px;
+             border-radius: 5px;
+             min-height: 100px; /* Minimum height for the response box */
+             margin-top: 20px;
+         }
+         .accordion {
+             background-color: #444;
+             border-radius: 5px;
+         }
+         .accordion-button {
+             color: white;
+             background-color: #555;
+         }
+         .accordion-body {
+             color: white; /* Improved visibility of text */
+         }
+         pre {
+             white-space: pre-wrap;
+         }
+         #exampleQueries {
+             margin-top: 10px;
+             text-align: center;
+         }
+
+         #exampleQueries button {
+             margin-right: 10px;
+             margin-bottom: 5px;
+         }
+
+
+         /* The switch - the box around the slider */
+
+         .language-toggle {
+             text-align: right;
+             margin: 16px;
+             position: relative; /* Added position relative to align the label */
+         }
+
+         .switch {
+             position: relative;
+             display: inline-block;
+             width: 60px;
+             height: 34px;
+         }
+
+         /* Hide default HTML checkbox */
+         .switch input {
+             opacity: 0;
+             width: 0;
+             height: 0;
+         }
+
+         /* The slider */
+         .slider {
+             position: absolute;
+             cursor: pointer;
+             top: 0;
+             left: 0;
+             right: 0;
+             bottom: 0;
+             background-color: #ccc;
+             -webkit-transition: .4s;
+             transition: .4s;
+         }
+
+         .slider:before {
+             position: absolute;
+             content: "";
+             height: 26px;
+             width: 26px;
+             left: 4px;
+             bottom: 4px;
+             background-color: white;
+             -webkit-transition: .4s;
+             transition: .4s;
+         }
+
+         input:checked + .slider {
+             background-color: #2196F3;
+         }
+
+         input:focus + .slider {
+             box-shadow: 0 0 1px #2196F3;
+         }
+
+         input:checked + .slider:before {
+             -webkit-transform: translateX(26px);
+             -ms-transform: translateX(26px);
+             transform: translateX(26px);
+         }
+
+         /* Rounded sliders */
+         .slider.round {
+             border-radius: 34px;
+         }
+
+         .slider.round:before {
+             border-radius: 50%;
+         }
+
+     </style>
+
+ </head>
+ <body>
+     <div class="container chat-container">
+         <h1 class="chat-heading">RAG Search</h1>
+
+         <!-- Tab Navigation -->
+         <ul class="nav nav-tabs" id="myTab" role="tablist">
+             <li class="nav-item" role="presentation">
+                 <button class="nav-link active" id="retriever-tab" data-bs-toggle="tab" data-bs-target="#retriever" type="button" role="tab" aria-controls="retriever" aria-selected="true">Retriever</button>
+             </li>
+             <li class="nav-item" role="presentation">
+                 <button class="nav-link" id="ingest-tab" data-bs-toggle="tab" data-bs-target="#ingest" type="button" role="tab" aria-controls="ingest" aria-selected="false">Ingest</button>
+             </li>
+         </ul>
+
+         <!-- Tab Content -->
+         <div class="tab-content" id="myTabContent">
+             <!-- Retriever Tab Pane -->
+             <div class="tab-pane fade show active" id="retriever" role="tabpanel" aria-labelledby="retriever-tab">
+                 <!-- Language Toggle Switch -->
+                 <div class="language-toggle">
+                     <label class="switch">
+                         <input type="checkbox" class="languageCheckbox"> <!-- Changed to class -->
+                         <span class="slider round"></span>
+                     </label>
+                     <span class="languageLabel">English</span> <!-- Changed to class -->
+                 </div>
+
+                 <div class="accordion" id="appDescriptionAccordion">
+                     <div class="accordion-item">
+                         <h2 class="accordion-header" id="descriptionHeading">
+                             <button class="accordion-button collapsed" type="button" data-bs-toggle="collapse" data-bs-target="#collapseDescription" aria-expanded="true" aria-controls="collapseDescription">
+                                 About This App
+                             </button>
+                         </h2>
+
+                         <div id="collapseDescription" class="accordion-collapse collapse" aria-labelledby="descriptionHeading" data-bs-parent="#appDescriptionAccordion">
+                             <div class="accordion-body text-dark">
+                                 This is a bilingual (Czech/English) RAG implementation. OpenAI models generate the answers, with Seznam multilingual embeddings for Czech and OpenAI embeddings for English, Chroma DB as the vector store, and LangChain with FastAPI as the orchestration layer.
+                             </div>
+                         </div>
+                     </div>
+                 </div>
+                 <!-- Example Queries Section -->
+                 <div id="exampleQueries" class="mb-3">
+                     <h2 class="h5">Try Example Queries:</h2>
+                     <button class="btn btn-sm btn-secondary example-query">What cable can you use to hang a pendant light on?</button>
+                     <button class="btn btn-sm btn-secondary example-query">What is the minimum gauge of live wires?</button>
+                     <button class="btn btn-sm btn-secondary example-query">What flammability requirements do plastic enclosures have to meet?</button>
+                 </div>
+
+                 <div class="row">
+                     <div class="col">
+                         <textarea id="userInput" class="form-control chat-input" placeholder="Type your query here..."></textarea>
+                         <button id="submitBtn" class="btn chat-button">Submit</button>
+                         <div id="response" class="chat-response"></div>
+                     </div>
+                 </div>
+             </div>
+
+             <!-- Ingest Tab Pane -->
+             <div class="tab-pane fade" id="ingest" role="tabpanel" aria-labelledby="ingest-tab">
+                 <!-- Language Toggle Switch -->
+                 <div class="language-toggle">
+                     <label class="switch">
+                         <input type="checkbox" class="languageCheckbox"> <!-- Changed to class -->
+                         <span class="slider round"></span>
+                     </label>
+                     <span class="languageLabel">English</span> <!-- Changed to class -->
+                 </div>
+
+                 <!-- Ingest Content -->
+                 <h2>Document Ingestion</h2>
+                 <input type="text" id="folderPath" class="form-control my-2" placeholder="Enter the path to your data folder" />
+                 <button id="ingestBtn" class="btn chat-button">Ingest</button>
+                 <div id="ingestResponse" class="chat-response"></div>
+             </div>
+         </div>
+     </div>
+
+     <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/js/bootstrap.bundle.min.js"></script>
+     <script>
+         // Function to update language labels and synchronize toggle states
+         function updateLanguageTogglesAndLabels(checked) {
+             document.querySelectorAll('.languageCheckbox').forEach(function(checkbox) {
+                 checkbox.checked = checked; // Synchronize toggle state
+             });
+             document.querySelectorAll('.languageLabel').forEach(function(label) {
+                 label.textContent = checked ? "Czech" : "English"; // Update all labels
+             });
+         }
+
+         // Attach event listeners to all language toggle switches
+         document.querySelectorAll('.languageCheckbox').forEach(function(checkbox) {
+             checkbox.addEventListener('change', function() {
+                 updateLanguageTogglesAndLabels(this.checked);
+             });
+         });
+
+         document.getElementById('submitBtn').addEventListener('click', async function() {
+             var userInput = document.getElementById('userInput').value;
+
+             var languageCheckbox = document.querySelector('.languageCheckbox').checked;
+             var language = languageCheckbox ? "czech" : "english";
+             document.getElementById('response').innerHTML = '<p>Processing...</p>';
+             const formData = new FormData();
+             formData.append('query', userInput);
+             formData.append('language', language);
+
+             try {
+                 const response = await fetch('/get_response', {
+                     method: 'POST',
+                     body: formData
+                 });
+
+                 if (!response.ok) {
+                     throw new Error('Network response was not ok');
+                 }
+
+                 const data = await response.json();
+                 document.getElementById('response').innerHTML = `<p>${data.answer}</p><br><pre><b>Context: </b> ${data.source_document}</pre><br><pre><b>Source Document: </b> ${data.doc}</pre>`;
+             } catch (error) {
+                 console.error('Error:', error);
+                 document.getElementById('response').innerHTML = '<p>Error processing your request</p>';
+             }
+         });
+
+         // Add event listeners to example queries
+         document.querySelectorAll('.example-query').forEach(item => {
+             item.addEventListener('click', function() {
+                 document.getElementById('userInput').value = this.textContent; // Insert clicked query into textarea
+             });
+         });
+
+         // Ingest data
+         document.getElementById('ingestBtn').addEventListener('click', async function() {
+             var folderPath = document.getElementById('folderPath').value;
+             var languageCheckbox = document.querySelector('.languageCheckbox').checked;
+             var language = languageCheckbox ? "czech" : "english"; // Determine the language based on the checkbox
+             document.getElementById('ingestResponse').innerHTML = '<p>Starting ingestion...</p>';
+             try {
+                 const response = await fetch('/ingest_data', {
+                     method: 'POST',
+                     headers: {
+                         'Content-Type': 'application/x-www-form-urlencoded',
+                     },
+                     body: `folderPath=${encodeURIComponent(folderPath)}&language=${language}` // URL-encode the path so spaces and special characters survive
+                 });
+
+                 if (!response.ok) {
+                     throw new Error('Network response was not ok');
+                 }
+
+                 const data = await response.json();
+                 document.getElementById('ingestResponse').innerHTML = `<p>${data.message}</p>`;
+             } catch (error) {
+                 console.error('Error:', error);
+                 document.getElementById('ingestResponse').innerHTML = '<p>Error during ingestion process</p>';
+             }
+         });
+     </script>
+ </body>
+ </html>