Hemasagar committed
Commit 79b1b39 · verified · 1 Parent(s): c9fcae3
Files changed (31)
  1. RD/.DS_Store +0 -0
  2. RD/.gitattributes +0 -35
  3. RD/AI_full_stack_repository/.DS_Store +0 -0
  4. RD/AI_full_stack_repository/README.md +0 -3
  5. RD/AI_full_stack_repository/app.py +0 -29
  6. RD/AI_full_stack_repository/config.yml +0 -11
  7. RD/AI_full_stack_repository/data/Doc-1.pdf +0 -0
  8. RD/AI_full_stack_repository/data/Doc-2.pdf +0 -0
  9. RD/AI_full_stack_repository/data/invoice_1.pdf +0 -0
  10. RD/AI_full_stack_repository/ingest.py +0 -34
  11. RD/AI_full_stack_repository/llm/__init__.py +0 -0
  12. RD/AI_full_stack_repository/llm/__pycache__/__init__.cpython-310.pyc +0 -0
  13. RD/AI_full_stack_repository/llm/__pycache__/__init__.cpython-312.pyc +0 -0
  14. RD/AI_full_stack_repository/llm/__pycache__/llm.cpython-310.pyc +0 -0
  15. RD/AI_full_stack_repository/llm/__pycache__/llm.cpython-312.pyc +0 -0
  16. RD/AI_full_stack_repository/llm/__pycache__/prompts.cpython-310.pyc +0 -0
  17. RD/AI_full_stack_repository/llm/__pycache__/prompts.cpython-312.pyc +0 -0
  18. RD/AI_full_stack_repository/llm/__pycache__/wrapper.cpython-310.pyc +0 -0
  19. RD/AI_full_stack_repository/llm/__pycache__/wrapper.cpython-312.pyc +0 -0
  20. RD/AI_full_stack_repository/llm/llm.py +0 -29
  21. RD/AI_full_stack_repository/llm/prompts.py +0 -13
  22. RD/AI_full_stack_repository/llm/test.py +0 -95
  23. RD/AI_full_stack_repository/llm/wrapper.py +0 -51
  24. RD/AI_full_stack_repository/models/.DS_Store +0 -0
  25. RD/AI_full_stack_repository/models/model_download.txt +0 -1
  26. RD/AI_full_stack_repository/requirements.txt +0 -13
  27. RD/AI_full_stack_repository/screenshot_images/.DS_Store +0 -0
  28. RD/AI_full_stack_repository/screenshot_images/invoice_image.png +0 -0
  29. RD/AI_full_stack_repository/vectorestore/.DS_Store +0 -0
  30. RD/AI_full_stack_repository/vectorestore/chroma/.DS_Store +0 -0
  31. RD/README.md +0 -13
RD/.DS_Store DELETED
Binary file (6.15 kB)
 
RD/.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
RD/AI_full_stack_repository/.DS_Store DELETED
Binary file (6.15 kB)
 
RD/AI_full_stack_repository/README.md DELETED
@@ -1,3 +0,0 @@
- # AI_full_stack_repository
- I built a LLM of Mistral7b-based chat with text by using my full-stack AI skills at work. On top of that, I created a prototype using the Streamlit API (module).
- # sample_test
 
RD/AI_full_stack_repository/app.py DELETED
@@ -1,29 +0,0 @@
- from ingest import run_ingest
- from llm.wrapper import setup_qa_chain
- from llm.wrapper import query_embeddings
- import timeit
-
-
- import streamlit as st
- def main():
-     st.set_page_config(page_title="Document seemless process ")
-     st.title("Auto text extraction with AI Planet ")
-     st.subheader("I can help you in extracting text from pdf,documents ....")
-     pdf = st.file_uploader("Upload text here for now, only PDF files allowed ", type=["pdf","txt"], accept_multiple_files=True)
-     submit = st.button("Extract Data")
-     if submit:
-         with st.spinner('Wait for it...'):
-             run_ingest()
-     question = st.text_input("Please wirte a Query: ", key="Please ask question on uploaded pdf")
-     submit = st.button('Generate')
-     if submit:
-         with st.spinner('Wait for it...'):
-             qa_chain = setup_qa_chain()
-             response = qa_chain({'query': question})
-             answer = {'answer': response['result']}
-             st.subheader("Answer:")
-             st.write(answer)
-             st.success("Hope I was able to save your time❤️")
- # Invoking main function
- if __name__ == '__main__':
-     main()
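
The deleted app.py reuses the `submit` variable for both buttons, passes a full sentence as the text input's `key` (Streamlit treats `key` as a widget identifier, not placeholder text), and never writes the uploaded files to disk, so run_ingest() only sees whatever is already in data/. A rerun-safe sketch of the same flow, assuming the same run_ingest and setup_qa_chain helpers:

import streamlit as st

from ingest import run_ingest
from llm.wrapper import setup_qa_chain

def main():
    st.set_page_config(page_title="Document seamless process")
    st.title("Auto text extraction with AI Planet")
    st.subheader("I can help you in extracting text from PDF documents.")
    # Note: uploaded files are not persisted; run_ingest() reads cfg.DATA_PATH from disk.
    pdf = st.file_uploader("Upload PDF files", type=["pdf", "txt"], accept_multiple_files=True)
    if st.button("Extract Data"):
        with st.spinner("Ingesting..."):
            run_ingest()
        st.session_state["ingested"] = True  # survives Streamlit's script reruns
    if st.session_state.get("ingested"):
        question = st.text_input("Please write a query:", key="query")
        if st.button("Generate") and question:
            with st.spinner("Generating..."):
                qa_chain = setup_qa_chain()
                response = qa_chain({"query": question})
            st.subheader("Answer:")
            st.write(response["result"])
            st.success("Hope I was able to save your time❤️")

if __name__ == '__main__':
    main()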
 
RD/AI_full_stack_repository/config.yml DELETED
@@ -1,11 +0,0 @@
- RETURN_SOURCE_DOCUMENTS: True
- VECTOR_COUNT: 2
- CHUNK_SIZE: 300
- CHUNK_OVERLAP: 30
- DATA_PATH: 'data/'
- DB_FAISS_PATH: 'vectorstore/chroma'
- MODEL_TYPE: 'mistral'
- MODEL_BIN_PATH: 'models/mistral-7b-instruct-v0.1.Q5_K_M.gguf'
- EMBEDDINGS: 'sentence-transformers/all-mpnet-base-v2'
- MAX_NEW_TOKENS: 2048
- TEMPERATURE: 0.00
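
Several of these keys are read but never honored by the deleted modules: ingest.py and wrapper.py persist and load Chroma at ./vectorestore/db_faiss rather than DB_FAISS_PATH, the embedder is hardcoded to all-MiniLM-L6-v2 instead of EMBEDDINGS, and llm.py ignores TEMPERATURE and MAX_NEW_TOKENS. Each module loads the file the same way; a minimal sketch mirroring that pattern:

import box   # python-box gives attribute-style access to the YAML dict
import yaml

with open('config.yml', 'r', encoding='utf8') as ymlfile:
    cfg = box.Box(yaml.safe_load(ymlfile))

print(cfg.DB_FAISS_PATH)  # 'vectorstore/chroma', but the code persists to ./vectorestore/db_faiss
print(cfg.EMBEDDINGS)     # 'sentence-transformers/all-mpnet-base-v2', but the code uses all-MiniLM-L6-v2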
 
RD/AI_full_stack_repository/data/Doc-1.pdf DELETED
Binary file (10.1 kB)
 
RD/AI_full_stack_repository/data/Doc-2.pdf DELETED
Binary file (12.2 kB)
 
RD/AI_full_stack_repository/data/invoice_1.pdf DELETED
Binary file (45.3 kB)
 
RD/AI_full_stack_repository/ingest.py DELETED
@@ -1,34 +0,0 @@
1
- import box
2
- import yaml
3
- from langchain.vectorstores import FAISS
4
- from langchain.document_loaders import PyPDFDirectoryLoader
5
- from langchain.text_splitter import CharacterTextSplitter
6
- from langchain.text_splitter import RecursiveCharacterTextSplitter
7
- from langchain.document_loaders import PyPDFLoader, DirectoryLoader
8
- from langchain.embeddings import HuggingFaceEmbeddings
9
- from langchain_community.embeddings.sentence_transformer import (
10
- SentenceTransformerEmbeddings,
11
- )
12
- from langchain.vectorstores import Chroma
13
-
14
- # Import config vars
15
- with open('config.yml', 'r', encoding='utf8') as ymlfile:
16
- cfg = box.Box(yaml.safe_load(ymlfile))
17
-
18
-
19
- def run_ingest():
20
- loader = DirectoryLoader(cfg.DATA_PATH,
21
- glob='*.pdf',
22
- loader_cls=PyPDFLoader)
23
-
24
- documents = loader.load()
25
-
26
- text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20,length_function =len,add_start_index = True)
27
- text = text_splitter.split_documents(documents)
28
- embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2",model_kwargs={'device': 'cpu'})
29
- # load it into Chroma
30
- # save to disk
31
- db2 = Chroma.from_documents(text, embedding_function, persist_directory="./vectorestore/db_faiss")
32
-
33
- if __name__ == "__main__":
34
- run_ingest()
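
Note that the splitter hardcodes chunk_size=1000 and chunk_overlap=20 rather than CHUNK_SIZE/CHUNK_OVERLAP from config.yml, the FAISS and HuggingFaceEmbeddings imports are unused, and the store is persisted at ./vectorestore/db_faiss. A quick smoke test of the persisted store, as a sketch assuming the same embedding model and persist directory:

from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'})
db = Chroma(persist_directory="./vectorestore/db_faiss", embedding_function=embedding_function)
for doc in db.similarity_search("What is invoice number?", k=2):
    print(doc.page_content[:200])  # first 200 characters of each matching chunk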
 
RD/AI_full_stack_repository/llm/__init__.py DELETED
File without changes
RD/AI_full_stack_repository/llm/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (160 Bytes)
 
RD/AI_full_stack_repository/llm/__pycache__/__init__.cpython-312.pyc DELETED
Binary file (164 Bytes)
 
RD/AI_full_stack_repository/llm/__pycache__/llm.cpython-310.pyc DELETED
Binary file (629 Bytes)
 
RD/AI_full_stack_repository/llm/__pycache__/llm.cpython-312.pyc DELETED
Binary file (970 Bytes)
 
RD/AI_full_stack_repository/llm/__pycache__/prompts.cpython-310.pyc DELETED
Binary file (454 Bytes)
 
RD/AI_full_stack_repository/llm/__pycache__/prompts.cpython-312.pyc DELETED
Binary file (465 Bytes)
 
RD/AI_full_stack_repository/llm/__pycache__/wrapper.cpython-310.pyc DELETED
Binary file (1.9 kB)
 
RD/AI_full_stack_repository/llm/__pycache__/wrapper.cpython-312.pyc DELETED
Binary file (2.7 kB)
 
RD/AI_full_stack_repository/llm/llm.py DELETED
@@ -1,29 +0,0 @@
- from langchain.llms import CTransformers
- import box
- import yaml
- from langchain.llms import LlamaCpp
- config = {'max_new_tokens': 2000,
-           'temperature': 0.01,
-           "context_length": 4000}
- # Import config vars
- with open('config.yml', 'r', encoding='utf8') as ymlfile:
-     cfg = box.Box(yaml.safe_load(ymlfile))
-
-
- def setup_llm():
-     # llm = CTransformers(model=cfg.MODEL_BIN_PATH,
-     #                     model_type=cfg.MODEL_TYPE,
-     #                     max_new_tokens=cfg.MAX_NEW_TOKENS,
-     #                     temperature=cfg.TEMPERATURE
-     #                     )
-     llm = LlamaCpp(
-         streaming=True,
-         model_path=cfg.MODEL_BIN_PATH,  # "mistral-7b-instruct-v0.1.Q4_K_M.gguf"
-         temperature=0.75,
-         top_p=1,
-         verbose=True,
-         n_ctx=4096
-     )
-
-
-     return llm
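
setup_llm() loads config.yml but then ignores it: temperature is hardcoded to 0.75 where config.yml sets 0.00, MAX_NEW_TOKENS is never applied, and the module-level config dict for the commented-out CTransformers path is unused. A sketch of a variant that honors the config, assuming the same cfg loading as above (max_tokens is LangChain's LlamaCpp cap on generated tokens):

from langchain.llms import LlamaCpp

def setup_llm():
    return LlamaCpp(
        model_path=cfg.MODEL_BIN_PATH,   # models/mistral-7b-instruct-v0.1.Q5_K_M.gguf
        temperature=cfg.TEMPERATURE,     # 0.00 per config.yml, not the hardcoded 0.75
        max_tokens=cfg.MAX_NEW_TOKENS,   # 2048 per config.yml
        top_p=1,
        n_ctx=4096,                      # context window for the quantized model
        streaming=True,
        verbose=True,
    )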
 
RD/AI_full_stack_repository/llm/prompts.py DELETED
@@ -1,13 +0,0 @@
- # Note: Precise formatting of spacing and indentation of the prompt template is important,
- # as it is highly sensitive to whitespace changes. For example, it could have problems generating
- # a summary from the pieces of context if the spacing is not done correctly
-
- qa_template = """Use the following pieces of information to answer the user's question.
- If you don't know the answer, just say that you don't know, don't try to make up an answer.
-
- Context: {context}
- Question: {question}
-
- Only return the helpful answer below and nothing else.
- Helpful answer:
- """
 
RD/AI_full_stack_repository/llm/test.py DELETED
@@ -1,95 +0,0 @@
- # from langchain.vectorstores import Chroma
- # # from langchain_chroma import Chroma
- # from langchain_community.document_loaders import TextLoader
- # from langchain_community.embeddings.sentence_transformer import (
- #     SentenceTransformerEmbeddings,
- # )
- # from langchain.document_loaders import PyPDFDirectoryLoader
- # from langchain_text_splitters import CharacterTextSplitter
- # from langchain.text_splitter import CharacterTextSplitter
- # from langchain.text_splitter import RecursiveCharacterTextSplitter
- # import os
- # os.getcwd()
-
- # # Load Documents
- # def file_loader(filename):
- #     if filename.endswith('.txt'):
- #         # load the text document and split it into chunks
- #         loader = TextLoader(filename)
- #         documents = loader.load()
- #         return documents
- #     # Loads pdf files available in a directory with pypdf
- #     elif filename.endswith('.pdf'):
- #         loader = PyPDFDirectoryLoader(filename)
- #         documents = loader.load()
- #         return documents
- # filename = '/data'
- # def load_docs(directory):
- #     loader = PyPDFDirectoryLoader(directory)
- #     documents = loader.load()
- #     if not documents:
- #         raise ValueError(f"No documents loaded from directory: {directory}")
- #     return documents
- # documents = load_docs(filename)
- # print(f"Number of loaded documents: {len(documents)}")
-
- # # split it into chunks
- # def split_docs(documents, chunk_size=2000, chunk_overlap=20):
- #     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
- #     docs = text_splitter.split_documents(documents)
- #     if not docs:
- #         raise ValueError("Document splitting resulted in an empty list.")
- #     return docs
- # docs = split_docs(documents)
- # print(f"Number of document chunks: {len(docs)}")
-
-
- # # Generate text embeddings
- # # Huggingface LLM for creating Embeddings for documents/text
-
- # # create the open-source embedding function
- # embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'})
-
- # # load it into Chroma
- # db = Chroma.from_documents(docs, embedding_function)
-
- # # query it
- # query = "What is invoice number?"
- # docs = db.similarity_search(query)
-
- # # print results
- # print(docs[0].page_content)
-
- #---------------------------------------------------------PDF-READER------------------------------------------------------------------
- # import easyocr
- # reader = easyocr.Reader(['en'])
- # result = reader.readtext(r'/Users/hemasagarendluri1996/llm-mistral-invoice-cpu/screenshot_images/invoice_image.png')
- # for detection in result:
- #     print(detection[1])
- import streamlit as st
-
- # Hello! It seems like you want to import the Streamlit library in Python. Streamlit is a powerful open-source framework used for building web applications with interactive data visualizations and machine learning models. To import Streamlit, you'll need to ensure that you have it installed in your Python environment.
- # Once you have Streamlit installed, you can import it into your Python script using the import statement,
- def main():
-
-     st.set_page_config(page_title="Document seemless process ")
-     st.title("Auto text extraction with AI Planet ")
-     st.subheader("I can help you in extracting text from pdf,documents ....")
-
-
-     # Upload the Invoices (pdf files)...
-     pdf = st.file_uploader("Upload invoices here for now, only PDF files allowed and will accept other formate as well", type=["pdf"], accept_multiple_files=True)
-
-     submit = st.button("Extract Data")
-     response = 4+5
-     if submit:
-         with st.spinner('Wait for it...'):
-             st.subheader("Answer:")
-             st.write(response)
-
-
-
-
- # Invoking main function
- if __name__ == '__main__':
-     main()
 
RD/AI_full_stack_repository/llm/wrapper.py DELETED
@@ -1,51 +0,0 @@
- import box
- import yaml
- from langchain.prompts import PromptTemplate
- from langchain.chains import RetrievalQA
- # from langchain.embeddings import HuggingFaceEmbeddings
- from langchain.vectorstores import FAISS
- from llm.prompts import qa_template
- from llm.llm import setup_llm
- from langchain_community.embeddings.sentence_transformer import (
-     SentenceTransformerEmbeddings,
- )
- from langchain.vectorstores import Chroma
- # Import config vars
- with open('config.yml', 'r', encoding='utf8') as ymlfile:
-     cfg = box.Box(yaml.safe_load(ymlfile))
- def set_qa_prompt():
-     """
-     Prompt template for QA retrieval for each vectorstore
-     """
-     prompt = PromptTemplate(template=qa_template,
-                             input_variables=['context', 'question'])
-     return prompt
-
- def build_retrieval_qa_chain(llm, prompt):
-     # create the open-source embedding function
-     embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'})
-     # load from disk
-     chromadb = Chroma(persist_directory="./vectorestore/db_faiss", embedding_function=embedding_function)
-     retriever = chromadb.as_retriever(search_kwargs={'k': cfg.VECTOR_COUNT})
-     qa_chain = RetrievalQA.from_chain_type(llm=llm,
-                                            chain_type='stuff',
-                                            retriever=retriever,
-                                            return_source_documents=cfg.RETURN_SOURCE_DOCUMENTS,
-                                            chain_type_kwargs={'prompt': prompt})
-
-     return qa_chain
- def setup_qa_chain():
-     llm = setup_llm()
-     qa_prompt = set_qa_prompt()
-     qa_chain = build_retrieval_qa_chain(llm, qa_prompt)
-     return qa_chain
-
-
-
-
- def query_embeddings(query):
-     embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'})
-     chromadb = Chroma(persist_directory="./vectorestore/db_faiss", embedding_function=embedding_function)
-     retriever = chromadb.as_retriever(search_kwargs={'k': cfg.VECTOR_COUNT})
-     semantic_search = retriever.similarity_search_with_relevance_scores(query)
-     return semantic_search
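
query_embeddings() calls similarity_search_with_relevance_scores on the retriever, but in LangChain that method lives on the vector store, so the function raises AttributeError as written. A working sketch, keeping the same store and the k value from config (same imports and cfg as in wrapper.py above):

def query_embeddings(query):
    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'})
    chromadb = Chroma(persist_directory="./vectorestore/db_faiss", embedding_function=embedding_function)
    # returns (document, score) pairs with scores normalized to [0, 1]
    return chromadb.similarity_search_with_relevance_scores(query, k=cfg.VECTOR_COUNT)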
 
RD/AI_full_stack_repository/models/.DS_Store DELETED
Binary file (6.15 kB)
 
RD/AI_full_stack_repository/models/model_download.txt DELETED
@@ -1 +0,0 @@
- Download the quantized mistral-7b-instruct-v0.1.Q5_K_M.gguf model from: https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/tree/main
 
RD/AI_full_stack_repository/requirements.txt DELETED
@@ -1,13 +0,0 @@
- streamlit==1.29.0
- langchain==0.1.13
- # unstructured==0.12.3
- tiktoken==0.5.2
- pypdf==4.1.0
- sentence-transformers==2.5.1
- langchain-community
- langchain-chroma
- numpy==1.26.1
- python-box
- llama-cpp-python==0.2.76
- # pdfservices-sdk==4.0.0
- watchdog==4.0.1
 
RD/AI_full_stack_repository/screenshot_images/.DS_Store DELETED
Binary file (6.15 kB)
 
RD/AI_full_stack_repository/screenshot_images/invoice_image.png DELETED
Binary file (220 kB)
 
RD/AI_full_stack_repository/vectorestore/.DS_Store DELETED
Binary file (6.15 kB)
 
RD/AI_full_stack_repository/vectorestore/chroma/.DS_Store DELETED
Binary file (6.15 kB)
 
RD/README.md DELETED
@@ -1,13 +0,0 @@
- ---
- title: RD
- emoji: 📊
- colorFrom: indigo
- colorTo: purple
- sdk: streamlit
- sdk_version: 1.35.0
- app_file: app.py
- pinned: false
- license: mit
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference