import os
import streamlit as st
# for text files
from langchain.document_loaders import TextLoader
# text splitter
from langchain.text_splitter import CharacterTextSplitter
# for using HuggingFace models & embeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain import HuggingFaceHub
# Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
from langchain.vectorstores import FAISS  # FAISS: Facebook AI Similarity Search
from langchain.chains.question_answering import load_qa_chain
# PDF loader
from langchain.document_loaders import UnstructuredPDFLoader

# Read the HuggingFace Hub token from Streamlit secrets (.streamlit/secrets.toml)
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["hf_api_key"]

st.title('Document Q&A - Ask anything in your Document')
st.sidebar.subheader('Upload document')
uploaded_file = st.file_uploader("Upload File", type=['txt', 'pdf'])

# Optional: download the sample document instead of uploading one
# (needs `import requests` if re-enabled)
# url2 = "https://github.com/fabiomatricardi/cdQnA/raw/main/KS-all-info_rev1.txt"
# res = requests.get(url2)
# with open("KS-all-info_rev1.txt", "w") as f:
#     f.write(res.text)

st.subheader('Enter query')
query = st.text_input('Ask anything about the Document you uploaded')
st.subheader('Answer')
st.write('Answer from document')  # placeholder until the chain below is wired in

# # Document Loader
# loader = TextLoader('./KS-all-info_rev1.txt')
# documents = loader.load()

# import textwrap
# def wrap_text_preserve_newlines(text, width=110):
#     # Split the input text into lines based on newline characters
#     lines = text.split('\n')
#     # Wrap each line individually
#     wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
#     # Join the wrapped lines back together using newline characters
#     wrapped_text = '\n'.join(wrapped_lines)
#     return wrapped_text

# # Text Splitter
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
# docs = text_splitter.split_documents(documents)

# # Embeddings
# embeddings = HuggingFaceEmbeddings()

# # Create the vectorized db
# db = FAISS.from_documents(docs, embeddings)

# llm = HuggingFaceHub(repo_id="google/flan-t5-xl", model_kwargs={"temperature": 0, "max_length": 512})
# llm2 = HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature": 0, "max_length": 512})
# chain = load_qa_chain(llm2, chain_type="stuff")

# # Sample question
# # query = "What are the actual issues and drawbacks?"
# # docs = db.similarity_search(query)
# # chain.run(input_documents=docs, question=query)

# # PDFs (Colab scratch commands, kept for reference)
# # !wget https://github.com/fabiomatricardi/cdQnA/raw/main/PLC_mediumArticle.pdf
# # !wget https://github.com/fabiomatricardi/cdQnA/raw/main/BridgingTheGaap_fromMedium.pdf
# # !mkdir pdfs
# # !cp *pdf '/content/pdfs'
# # pdf_folder_path = '/content/pdfs'
# # os.listdir(pdf_folder_path)
# # loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in os.listdir(pdf_folder_path)]

# # Index the PDF loaders
# # (needs `from langchain.indexes import VectorstoreIndexCreator` if re-enabled)
# index = VectorstoreIndexCreator(
#     embedding=HuggingFaceEmbeddings(),
#     text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loaders)

# # Load llm with selected one
# llm2 = HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature": 0, "max_length": 512})

# # Prepare the pipeline
# from langchain.chains import RetrievalQA
# chain = RetrievalQA.from_chain_type(llm=llm2,
#                                     chain_type="stuff",
#                                     retriever=index.vectorstore.as_retriever(),
#                                     input_key="question")

# # Get a reply to our questions
# # chain.run('What is the difference between a PLC and a PC?')
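
# --- Minimal wiring sketch (not part of the original tutorial code) ---
# As written, the widgets above never reach the QA pipeline, so the app always
# shows the placeholder answer. Below is one possible way to connect them,
# reusing the imports at the top of this file and the parameter choices from
# the commented pipeline above; `save_upload` and `build_db` are hypothetical
# helper names introduced here, not LangChain or Streamlit API.
import tempfile

def save_upload(file):
    # Persist the in-memory upload so the file-based loaders can open it by path.
    suffix = os.path.splitext(file.name)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(file.getvalue())
        return tmp.name

def build_db(path):
    # Pick a loader by extension (UnstructuredPDFLoader needs the `unstructured`
    # package installed), split into ~1000-character chunks, embed, index with FAISS.
    loader = UnstructuredPDFLoader(path) if path.lower().endswith('.pdf') else TextLoader(path)
    docs = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10).split_documents(loader.load())
    return FAISS.from_documents(docs, HuggingFaceEmbeddings())

if uploaded_file is not None and query:
    with st.spinner('Searching the document...'):
        db = build_db(save_upload(uploaded_file))
        llm = HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large",
                             model_kwargs={"temperature": 0, "max_length": 512})
        chain = load_qa_chain(llm, chain_type="stuff")
        # Retrieve the chunks most similar to the question and answer from them only.
        answer = chain.run(input_documents=db.similarity_search(query), question=query)
    st.write(answer)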