# Streamlit "Document Q&A" app (Hugging Face Space).
"""Document Q&A - ask questions about an uploaded document.

Streamlit UI backed by LangChain: documents are chunked, embedded with
HuggingFace embeddings, indexed in a FAISS vector store, and answered by a
HuggingFace Hub LLM. NOTE(review): the retrieval/QA pipeline below is still
commented out (tutorial scaffolding) — the UI currently collects the upload
and the query but never computes an answer; wire `uploaded_file` and `query`
into the chain before shipping.
"""
import os

import streamlit as st

# Plain-text document loader
from langchain.document_loaders import TextLoader
# Splits long documents into overlapping chunks for embedding
from langchain.text_splitter import CharacterTextSplitter
# HuggingFace models & embeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain import HuggingFaceHub
# Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
from langchain.vectorstores import FAISS
# QA chain that "stuffs" retrieved chunks into the prompt
from langchain.chains.question_answering import load_qa_chain
# PDF loader
from langchain.document_loaders import UnstructuredPDFLoader

# HuggingFace Hub token is read from Streamlit secrets — never hard-code it.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["hf_api_key"]

st.title('Document Q&A - Ask anything in your Document')
st.sidebar.subheader('Upload document')
# NOTE(review): the subheader is in the sidebar but the uploader renders in
# the main area — confirm whether st.sidebar.file_uploader was intended.
uploaded_file = st.file_uploader("Upload File", type=['txt', 'pdf'])
# url2 = "https://github.com/fabiomatricardi/cdQnA/raw/main/KS-all-info_rev1.txt"
# res = requests.get(url2)
# with open("KS-all-info_rev1.txt", "w") as f:
#     f.write(res.text)

st.subheader('Enter query')
query = st.text_input('Ask anything about the Document you uploaded')
st.subheader('Answer')
# NOTE(review): placeholder — `query` is never passed to a chain yet.
st.write('Answer from document')

# ---------------------------------------------------------------------------
# Commented-out tutorial scaffolding for the retrieval pipeline, kept for
# reference until the app is wired up.
# ---------------------------------------------------------------------------
# # Document Loader
# loader = TextLoader('./KS-all-info_rev1.txt')
# documents = loader.load()
# import textwrap
# def wrap_text_preserve_newlines(text, width=110):
#     # Split the input text into lines based on newline characters
#     lines = text.split('\n')
#     # Wrap each line individually
#     wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
#     # Join the wrapped lines back together using newline characters
#     wrapped_text = '\n'.join(wrapped_lines)
#     return wrapped_text
# # Text Splitter
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
# docs = text_splitter.split_documents(documents)
# # Embeddings
# embeddings = HuggingFaceEmbeddings()
# # Create the vectorized db
# db = FAISS.from_documents(docs, embeddings)
# llm = HuggingFaceHub(repo_id="google/flan-t5-xl", model_kwargs={"temperature": 0, "max_length": 512})
# llm2 = HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature": 0, "max_length": 512})
# chain = load_qa_chain(llm2, chain_type="stuff")
# # Sample question
# # query = "What the actual issues and drawbacks ?"
# # docs = db.similarity_search(query)
# # chain.run(input_documents=docs, question=query)
# # PDFs
# # !wget https://github.com/fabiomatricardi/cdQnA/raw/main/PLC_mediumArticle.pdf
# # !wget https://github.com/fabiomatricardi/cdQnA/raw/main/BridgingTheGaap_fromMedium.pdf
# # !mkdir pdfs
# # !cp *pdf '/content/pdfs'
# # pdf_folder_path = '/content/pdfs'
# # os.listdir(pdf_folder_path)
# # loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in os.listdir(pdf_folder_path)]
# # loaders
# # NOTE(review): VectorstoreIndexCreator is used below but never imported —
# # add `from langchain.indexes import VectorstoreIndexCreator` when enabling.
# index = VectorstoreIndexCreator(
#     embedding=HuggingFaceEmbeddings(),
#     text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loaders)
# # Load llm with selected one
# llm2 = HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature": 0, "max_length": 512})
# # Prepare the pipeline
# from langchain.chains import RetrievalQA
# chain = RetrievalQA.from_chain_type(llm=llm2,
#                                     chain_type="stuff",
#                                     retriever=index.vectorstore.as_retriever(),
#                                     input_key="question")
# # get reply to our questions
# # chain.run('What is the difference between a PLC and a PC?')