Spaces:

Hemasagar
/

RD

Sleeping

App Files Files Community

Hemasagar committed on Jun 4, 2024

Commit

68cd524

verified ·

1 Parent(s): 65b0a64

Upload 13 files

Browse files

Files changed (13) hide show

llm/__init__.py +0 -0
llm/__pycache__/__init__.cpython-310.pyc +0 -0
llm/__pycache__/__init__.cpython-312.pyc +0 -0
llm/__pycache__/llm.cpython-310.pyc +0 -0
llm/__pycache__/llm.cpython-312.pyc +0 -0
llm/__pycache__/prompts.cpython-310.pyc +0 -0
llm/__pycache__/prompts.cpython-312.pyc +0 -0
llm/__pycache__/wrapper.cpython-310.pyc +0 -0
llm/__pycache__/wrapper.cpython-312.pyc +0 -0
llm/llm.py +29 -0
llm/prompts.py +13 -0
llm/test.py +95 -0
llm/wrapper.py +51 -0

llm/__init__.py ADDED Viewed

File without changes

llm/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (160 Bytes). View file

llm/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (164 Bytes). View file

llm/__pycache__/llm.cpython-310.pyc ADDED Viewed

Binary file (629 Bytes). View file

llm/__pycache__/llm.cpython-312.pyc ADDED Viewed

Binary file (970 Bytes). View file

llm/__pycache__/prompts.cpython-310.pyc ADDED Viewed

Binary file (454 Bytes). View file

llm/__pycache__/prompts.cpython-312.pyc ADDED Viewed

Binary file (465 Bytes). View file

llm/__pycache__/wrapper.cpython-310.pyc ADDED Viewed

Binary file (1.9 kB). View file

llm/__pycache__/wrapper.cpython-312.pyc ADDED Viewed

Binary file (2.7 kB). View file

llm/llm.py ADDED Viewed

	@@ -0,0 +1,29 @@

+from langchain.llms import CTransformers
+import box
+import yaml
+from langchain.llms import LlamaCpp
+# Generation defaults kept in a plain dict.
+# NOTE(review): nothing in the visible code reads `config`; setup_llm() passes its
+# own literal values instead -- confirm whether this dict is still needed.
+config={'max_new_tokens': 2000,
+        'temperature': 0.01,
+        "context_length" : 4000}
+# Import config vars
+# config.yml is opened by relative path when this module is imported; the parsed
+# YAML is wrapped in a Box so entries are read as attributes (e.g. cfg.MODEL_BIN_PATH).
+with open('config.yml', 'r', encoding='utf8') as ymlfile:
+    cfg = box.Box(yaml.safe_load(ymlfile))
+def setup_llm():
+    # llm = CTransformers(model=cfg.MODEL_BIN_PATH,
+    #                     model_type=cfg.MODEL_TYPE,
+    #                     max_new_tokens=cfg.MAX_NEW_TOKENS,
+    #                     temperature=cfg.TEMPERATURE
+    # )
+    llm = LlamaCpp(
+    streaming = True,
+    model_path=cfg.MODEL_BIN_PATH,#"mistral-7b-instruct-v0.1.Q4_K_M.gguf",
+    temperature=0.75,
+    top_p=1,
+    verbose=True,
+    n_ctx=4096
+    )
+    return llm

llm/prompts.py ADDED Viewed

	@@ -0,0 +1,13 @@

+# Note: Precise formatting of spacing and indentation of the prompt template is important,
+# as it is highly sensitive to whitespace changes. For example, it could have problems generating
+# a summary from the pieces of context if the spacing is not done correctly
+# The template exposes two placeholders, {context} and {question}, and ends with a
+# newline after "Helpful answer:".
+qa_template = """Use the following pieces of information to answer the user's question.
+If you don't know the answer, just say that you don't know, don't try to make up an answer.
+Context: {context}
+Question: {question}
+Only return the helpful answer below and nothing else.
+Helpful answer:
+"""

llm/test.py ADDED Viewed

	@@ -0,0 +1,95 @@

+# from langchain.vectorstores import Chroma
+# # from langchain_chroma import Chroma
+# from langchain_community.document_loaders import TextLoader
+# from langchain_community.embeddings.sentence_transformer import (
+#     SentenceTransformerEmbeddings,
+# )
+# from langchain.document_loaders import PyPDFDirectoryLoader
+# from langchain_text_splitters import CharacterTextSplitter
+# from langchain.text_splitter import CharacterTextSplitter
+# from langchain.text_splitter import RecursiveCharacterTextSplitter
+# import os
+# os.getcwd()
+# #Load Documents
+# def file_loader(filename):
+#     if filename.endswith('.txt'):
+#         # load the text document and split it into chunks
+#         loader = TextLoader(filename)
+#         documents = loader.load()
+#         return documents
+#     #Loads pdf files available in a directory with pypdf
+#     elif filename.endswith('.pdf'):
+#         loader = PyPDFDirectoryLoader(filename)
+#         documents = loader.load()
+#         return documents
+# filename = '/data'
+# def load_docs(directory):
+#     loader = PyPDFDirectoryLoader(directory)
+#     documents = loader.load()
+#     if not documents:
+#         raise ValueError(f"No documents loaded from directory: {directory}")
+#     return documents
+# documents = load_docs(filename)
+# print(f"Number of loaded documents: {len(documents)}")
+# # split it into chunks
+# def split_docs(documents, chunk_size=2000, chunk_overlap=20):
+#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+#     docs = text_splitter.split_documents(documents)
+#     if not docs:
+#         raise ValueError("Document splitting resulted in an empty list.")
+#     return docs
+# docs = split_docs(documents)
+# print(f"Number of document chunks: {len(docs)}")
+# # Generate text embeddings
+# #Huggingface LLM for creating Embeddings for documents/text
+# # create the open-source embedding function
+# embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2",model_kwargs={'device': 'cpu'})
+# # load it into Chroma
+# db = Chroma.from_documents(docs, embedding_function)
+# # query it
+# query = "What is invoice number?"
+# docs = db.similarity_search(query)
+# # print results
+# print(docs[0].page_content)
+#---------------------------------------------------------PDF-READER------------------------------------------------------------------
+# import easyocr
+# reader = easyocr.Reader(['en'])
+# result = reader.readtext(r'/Users/hemasagarendluri1996/llm-mistral-invoice-cpu/screenshot_images/invoice_image.png')
+# for detection in result:
+#     print(detection[1])
+import streamlit as st
+# Streamlit is an open-source framework for building web applications with interactive data visualizations and machine learning models.
+# It must be installed in the Python environment before the import statement above can succeed.
+def main():
+    st.set_page_config(page_title="Document seemless process ")
+    st.title("Auto text extraction with AI Planet ")
+    st.subheader("I can help you in extracting text from pdf,documents ....")
+    # Upload the Invoices (pdf files)...
+    pdf = st.file_uploader("Upload invoices here for now, only PDF files allowed and will accept other formate as well", type=["pdf"],accept_multiple_files=True)
+    submit=st.button("Extract Data")
+    response = 4+5
+    if submit:
+        with st.spinner('Wait for it...'):
+            st.subheader("Answer:")
+        st.write(response)
+#Invoking main function
+if __name__ == '__main__':
+    main()

llm/wrapper.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import box
+import yaml
+from langchain.prompts import PromptTemplate
+from langchain.chains import RetrievalQA
+# from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores import FAISS
+from llm.prompts import qa_template
+from llm.llm import setup_llm
+from langchain_community.embeddings.sentence_transformer import (
+    SentenceTransformerEmbeddings,
+)
+from langchain.vectorstores import Chroma
+# Import config vars
+# config.yml is opened by relative path when this module is imported; the functions
+# in this file read cfg.VECTOR_COUNT and cfg.RETURN_SOURCE_DOCUMENTS from it.
+with open('config.yml', 'r', encoding='utf8') as ymlfile:
+    cfg = box.Box(yaml.safe_load(ymlfile))
+def set_qa_prompt():
+    """
+    Prompt template for QA retrieval for each vectorstore
+    """
+    prompt = PromptTemplate(template=qa_template,
+                            input_variables=['context', 'question'])
+    return prompt
+def build_retrieval_qa_chain(llm, prompt):
+    # create the open-source embedding function
+    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2",model_kwargs={'device': 'cpu'})
+    # load from disk
+    chromadb = Chroma(persist_directory="./vectorestore/db_faiss", embedding_function=embedding_function)
+    retriever = chromadb.as_retriever(search_kwargs={'k': cfg.VECTOR_COUNT})
+    qa_chain = RetrievalQA.from_chain_type(llm=llm,
+                                           chain_type='stuff',
+                                           retriever=retriever,
+                                           return_source_documents=cfg.RETURN_SOURCE_DOCUMENTS,
+                                           chain_type_kwargs={'prompt': prompt})
+    return qa_chain
+def setup_qa_chain():
+    llm = setup_llm()
+    qa_prompt = set_qa_prompt()
+    qa_chain = build_retrieval_qa_chain(llm, qa_prompt)
+    return qa_chain
+def query_embeddings(query):
+    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2",model_kwargs={'device': 'cpu'})
+    chromadb = Chroma(persist_directory="./vectorestore/db_faiss", embedding_function=embedding_function)
+    retriever = chromadb.as_retriever(search_kwargs={'k': cfg.VECTOR_COUNT})
+    semantic_search = retriever.similarity_search_with_relevance_scores(query)
+    return semantic_search