Hemasagar committed on
Commit
68cd524
·
verified ·
1 Parent(s): 65b0a64

Upload 13 files

Browse files
llm/__init__.py ADDED
File without changes
llm/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (160 Bytes). View file
 
llm/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (164 Bytes). View file
 
llm/__pycache__/llm.cpython-310.pyc ADDED
Binary file (629 Bytes). View file
 
llm/__pycache__/llm.cpython-312.pyc ADDED
Binary file (970 Bytes). View file
 
llm/__pycache__/prompts.cpython-310.pyc ADDED
Binary file (454 Bytes). View file
 
llm/__pycache__/prompts.cpython-312.pyc ADDED
Binary file (465 Bytes). View file
 
llm/__pycache__/wrapper.cpython-310.pyc ADDED
Binary file (1.9 kB). View file
 
llm/__pycache__/wrapper.cpython-312.pyc ADDED
Binary file (2.7 kB). View file
 
llm/llm.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.llms import CTransformers
2
+ import box
3
+ import yaml
4
+ from langchain.llms import LlamaCpp
5
# Generation defaults. Nothing in llm/llm.py reads this dict; the LlamaCpp
# settings are passed explicitly inside setup_llm().
config = dict(
    max_new_tokens=2000,
    temperature=0.01,
    context_length=4000,
)

# Load the project settings from config.yml. Wrapping the parsed YAML in a Box
# lets the rest of the module read keys as attributes (e.g. cfg.MODEL_BIN_PATH).
with open('config.yml', mode='r', encoding='utf8') as ymlfile:
    cfg = box.Box(yaml.safe_load(ymlfile))
11
+
12
+
13
def setup_llm():
    """Create the llama.cpp-backed LLM used for question answering.

    The model file is taken from ``cfg.MODEL_BIN_PATH`` in config.yml (for
    example a GGUF build such as "mistral-7b-instruct-v0.1.Q4_K_M.gguf").
    Sampling and context settings are fixed in this function rather than
    read from the config.

    A CTransformers-based alternative (configured from cfg.MODEL_TYPE,
    cfg.MAX_NEW_TOKENS and cfg.TEMPERATURE) was left disabled here and is
    not used.

    Returns:
        The configured ``LlamaCpp`` instance.
    """
    llama_options = {
        'streaming': True,
        'model_path': cfg.MODEL_BIN_PATH,
        'temperature': 0.75,
        'top_p': 1,
        'verbose': True,
        'n_ctx': 4096,
    }
    return LlamaCpp(**llama_options)
llm/prompts.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Note: Precise formatting of spacing and indentation of the prompt template is important,
2
+ # as it is highly sensitive to whitespace changes. For example, it could have problems generating
3
+ # a summary from the pieces of context if the spacing is not done correctly
4
+
5
# Built one line per literal so every newline in the prompt is explicit.
# The resulting text must not change: the prompt is sensitive to whitespace.
qa_template = (
    "Use the following pieces of information to answer the user's question.\n"
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
    "\n"
    "Context: {context}\n"
    "Question: {question}\n"
    "\n"
    "Only return the helpful answer below and nothing else.\n"
    "Helpful answer:\n"
)
llm/test.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from langchain.vectorstores import Chroma
2
+ # # from langchain_chroma import Chroma
3
+ # from langchain_community.document_loaders import TextLoader
4
+ # from langchain_community.embeddings.sentence_transformer import (
5
+ # SentenceTransformerEmbeddings,
6
+ # )
7
+ # from langchain.document_loaders import PyPDFDirectoryLoader
8
+ # from langchain_text_splitters import CharacterTextSplitter
9
+ # from langchain.text_splitter import CharacterTextSplitter
10
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ # import os
12
+ # os.getcwd()
13
+
14
+ # #Load Documents
15
+ # def file_loader(filename):
16
+ # if filename.endswith('.txt'):
17
+ # # load the text document and split it into chunks
18
+ # loader = TextLoader(filename)
19
+ # documents = loader.load()
20
+ # return documents
21
+ # #Loads pdf files available in a directory with pypdf
22
+ # elif filename.endswith('.pdf'):
23
+ # loader = PyPDFDirectoryLoader(filename)
24
+ # documents = loader.load()
25
+ # return documents
26
+ # filename = '/data'
27
+ # def load_docs(directory):
28
+ # loader = PyPDFDirectoryLoader(directory)
29
+ # documents = loader.load()
30
+ # if not documents:
31
+ # raise ValueError(f"No documents loaded from directory: {directory}")
32
+ # return documents
33
+ # documents = load_docs(filename)
34
+ # print(f"Number of loaded documents: {len(documents)}")
35
+
36
+ # # split it into chunks
37
+ # def split_docs(documents, chunk_size=2000, chunk_overlap=20):
38
+ # text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
39
+ # docs = text_splitter.split_documents(documents)
40
+ # if not docs:
41
+ # raise ValueError("Document splitting resulted in an empty list.")
42
+ # return docs
43
+ # docs = split_docs(documents)
44
+ # print(f"Number of document chunks: {len(docs)}")
45
+
46
+
47
+ # # Generate text embeddings
48
+ # #Huggingface LLM for creating Embeddings for documents/text
49
+
50
+ # # create the open-source embedding function
51
+ # embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2",model_kwargs={'device': 'cpu'})
52
+
53
+ # # load it into Chroma
54
+ # db = Chroma.from_documents(docs, embedding_function)
55
+
56
+ # # query it
57
+ # query = "What is invoice number?"
58
+ # docs = db.similarity_search(query)
59
+
60
+ # # print results
61
+ # print(docs[0].page_content)
62
+
63
+ #---------------------------------------------------------PDF-READER------------------------------------------------------------------
64
+ # import easyocr
65
+ # reader = easyocr.Reader(['en'])
66
+ # result = reader.readtext(r'/Users/hemasagarendluri1996/llm-mistral-invoice-cpu/screenshot_images/invoice_image.png')
67
+ # for detection in result:
68
+ # print(detection[1])
69
+ import streamlit as st
70
+
71
+ #Hello! It seems like you want to import the Streamlit library in Python. Streamlit is a powerful open-source framework used for building web applications with interactive data visualizations and machine learning models. To import Streamlit, you'll need to ensure that you have it installed in your Python environment.
72
+ #Once you have Streamlit installed, you can import it into your Python script using the import statement.
73
def main():
    """Render the Streamlit page for the document text-extraction demo.

    The page shows a title, a multi-file PDF uploader and an "Extract Data"
    button. The uploaded files are not processed yet: pressing the button
    only displays a placeholder value under an "Answer:" heading.
    """
    st.set_page_config(page_title="Document seemless process ")
    st.title("Auto text extraction with AI Planet ")
    st.subheader("I can help you in extracting text from pdf,documents ....")

    # Upload the invoices (the widget is restricted to PDF files).
    uploader_label = (
        "Upload invoices here for now, only PDF files allowed "
        "and will accept other formate as well"
    )
    pdf = st.file_uploader(
        uploader_label,
        type=["pdf"],
        accept_multiple_files=True,
    )

    submit = st.button("Extract Data")
    # Placeholder result; no extraction is wired in yet.
    response = 4 + 5

    if not submit:
        return

    with st.spinner('Wait for it...'):
        st.subheader("Answer:")
        st.write(response)


# Invoke main() only when the file is executed as a script.
if __name__ == "__main__":
    main()
llm/wrapper.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import box
2
+ import yaml
3
+ from langchain.prompts import PromptTemplate
4
+ from langchain.chains import RetrievalQA
5
+ # from langchain.embeddings import HuggingFaceEmbeddings
6
+ from langchain.vectorstores import FAISS
7
+ from llm.prompts import qa_template
8
+ from llm.llm import setup_llm
9
+ from langchain_community.embeddings.sentence_transformer import (
10
+ SentenceTransformerEmbeddings,
11
+ )
12
+ from langchain.vectorstores import Chroma
13
# Project settings from config.yml, wrapped in a Box so keys can be read as
# attributes (cfg.VECTOR_COUNT, cfg.RETURN_SOURCE_DOCUMENTS).
with open('config.yml', mode='r', encoding='utf8') as ymlfile:
    cfg = box.Box(yaml.safe_load(ymlfile))
16
def set_qa_prompt():
    """Build the QA retrieval prompt.

    Returns:
        A ``PromptTemplate`` wrapping ``qa_template`` with the ``context``
        and ``question`` input variables.
    """
    return PromptTemplate(
        input_variables=['context', 'question'],
        template=qa_template,
    )
23
+
24
def build_retrieval_qa_chain(llm, prompt):
    """Assemble a RetrievalQA chain over the persisted Chroma store.

    Args:
        llm: The language model the chain uses to answer.
        prompt: The prompt passed to the 'stuff' chain.

    Returns:
        The ``RetrievalQA`` chain; whether it returns source documents is
        controlled by ``cfg.RETURN_SOURCE_DOCUMENTS``.
    """
    # Open-source sentence-transformer embeddings, run on CPU.
    embedder = SentenceTransformerEmbeddings(
        model_name="all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
    )

    # Open the vector store persisted on disk.
    store = Chroma(
        persist_directory="./vectorestore/db_faiss",
        embedding_function=embedder,
    )

    # Retrieve cfg.VECTOR_COUNT chunks per question.
    top_k_retriever = store.as_retriever(search_kwargs={'k': cfg.VECTOR_COUNT})

    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type='stuff',
        retriever=top_k_retriever,
        return_source_documents=cfg.RETURN_SOURCE_DOCUMENTS,
        chain_type_kwargs={'prompt': prompt},
    )
37
def setup_qa_chain():
    """Create the LLM and the QA prompt, then wire them into a retrieval chain.

    Returns:
        The chain produced by ``build_retrieval_qa_chain``.
    """
    # Arguments are evaluated left to right, so the LLM is still created
    # before the prompt.
    return build_retrieval_qa_chain(setup_llm(), set_qa_prompt())
42
+
43
+
44
+
45
+
46
def query_embeddings(query):
    """Run a semantic search for ``query`` against the persisted Chroma store.

    The original code called ``similarity_search_with_relevance_scores`` on
    the object returned by ``as_retriever()``. That method is defined on the
    vector store, not on the retriever, so the call failed with
    ``AttributeError``. The search is now issued on the store itself, with
    the same result count (``cfg.VECTOR_COUNT``) the retriever was given.

    Args:
        query: The text to search for.

    Returns:
        The store's ``similarity_search_with_relevance_scores`` result:
        (document, relevance score) pairs, at most ``cfg.VECTOR_COUNT``.
    """
    # Same embedding model that build_retrieval_qa_chain uses for this store.
    embedding_function = SentenceTransformerEmbeddings(
        model_name="all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
    )
    # NOTE(review): the directory is named "db_faiss" but is opened with
    # Chroma -- confirm this matches what the ingestion step writes.
    chromadb = Chroma(
        persist_directory="./vectorestore/db_faiss",
        embedding_function=embedding_function,
    )
    return chromadb.similarity_search_with_relevance_scores(
        query,
        k=cfg.VECTOR_COUNT,
    )