File size: 3,533 Bytes
eaf0e00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import os
import streamlit as st

#for textfiles
from langchain.document_loaders import TextLoader
#text splitter
from langchain.text_splitter import CharacterTextSplitter
#for using HuggingFace models & embeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain import HuggingFaceHub
# Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
from langchain.vectorstores import FAISS
#facebook vectorization
from langchain.chains.question_answering import load_qa_chain
#load pdf
from langchain.document_loaders import UnstructuredPDFLoader

# Expose the Hugging Face Hub token (from Streamlit secrets) to the
# environment so HuggingFaceHub / HuggingFaceEmbeddings can authenticate.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["hf_api_key"]

st.title('Document Q&A - Ask anything in your Document')
st.sidebar.subheader('Upload document')
uploaded_file = st.file_uploader("Upload File",type=['txt','pdf'])

st.subheader('Enter query')
query = st.text_input('Ask anything about the Document you uploaded')

st.subheader('Answer')
if uploaded_file is not None and query:
    # Persist the upload to disk so the LangChain loaders (which take a
    # file path, not a file-like object) can read it.
    with open(uploaded_file.name, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # Pick the loader by extension; the uploader only accepts txt and pdf.
    if uploaded_file.name.lower().endswith('.pdf'):
        loader = UnstructuredPDFLoader(uploaded_file.name)
    else:
        loader = TextLoader(uploaded_file.name)
    documents = loader.load()

    # Chunk the document, embed the chunks, and index them in FAISS.
    # chunk_size/overlap mirror the values prototyped below in this file.
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
    docs = text_splitter.split_documents(documents)
    db = FAISS.from_documents(docs, HuggingFaceEmbeddings())

    # "stuff" chain: concatenate the retrieved chunks into a single prompt.
    llm = HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large",
                         model_kwargs={"temperature": 0, "max_length": 512})
    chain = load_qa_chain(llm, chain_type="stuff")

    # Retrieve the most similar chunks for the query and generate an answer.
    relevant_docs = db.similarity_search(query)
    answer = chain.run(input_documents=relevant_docs, question=query)
    st.write(answer)
else:
    # No document and/or question yet: keep the original placeholder text.
    st.write('Answer from document')

# # Document Loader
# loader = TextLoader('./KS-all-info_rev1.txt')
# documents = loader.load()
# import textwrap
# def wrap_text_preserve_newlines(text, width=110):
#     # Split the input text into lines based on newline characters
#     lines = text.split('\n')
#     # Wrap each line individually
#     wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
#     # Join the wrapped lines back together using newline characters
#     wrapped_text = '\n'.join(wrapped_lines)
#     return wrapped_text

# # Text Splitter
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
# docs = text_splitter.split_documents(documents)

# # Embeddings
# embeddings = HuggingFaceEmbeddings()

# #Create the vectorized db
# db = FAISS.from_documents(docs, embeddings)

# llm=HuggingFaceHub(repo_id="google/flan-t5-xl", model_kwargs={"temperature":0, "max_length":512})
# llm2=HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature":0, "max_length":512})
# chain = load_qa_chain(llm2, chain_type="stuff")

# # Sample question
# # query = "What the actual issues and drawbacks ?"

# # docs = db.similarity_search(query)
# # chain.run(input_documents=docs, question=query)


# # PDFs
# # !wget https://github.com/fabiomatricardi/cdQnA/raw/main/PLC_mediumArticle.pdf
# # !wget https://github.com/fabiomatricardi/cdQnA/raw/main/BridgingTheGaap_fromMedium.pdf
# # !mkdir pdfs
# # !cp *pdf '/content/pdfs'

# # pdf_folder_path = '/content/pdfs'
# # os.listdir(pdf_folder_path)

# # loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in os.listdir(pdf_folder_path)]
# # loaders

# index = VectorstoreIndexCreator(
#     embedding=HuggingFaceEmbeddings(),
#     text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loaders)

# #Load llm with selected one
# llm2=HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature":0, "max_length":512})
# #Prepare the pipeline
# from langchain.chains import RetrievalQA
# chain = RetrievalQA.from_chain_type(llm=llm2,
#                                     chain_type="stuff",
#                                     retriever=index.vectorstore.as_retriever(),
#                                     input_key="question")
# #get reply to our questions
# # chain.run('What is the difference between a PLC and a PC?')