|
from PyPDF2 import PdfReader |
|
|
|
from tqdm import tqdm |
|
import tiktoken |
|
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings |
|
from langchain.vectorstores import Chroma |
|
import openai |
|
import streamlit as st |
|
import gradio as gr |
|
from gradio.components import Textbox, Slider |
|
import os |
|
|
|
|
|
# OpenAI API key comes from the environment; if the OPENai variable is unset,
# os.getenv returns None and API calls will fail with an auth error at runtime.
openai.api_key = os.getenv("OPENAI")


# Source document indexed at startup by run().
file_path = "data/Hair-Relaxer-Master-Complaint-1.pdf"

# Number of new tokens each chunk advances by in split_text().
paragraph_length = 100

# Tokens of trailing context shared between consecutive chunks.
overlapping_length = 50

# Module-level Chroma vector store; populated by run(), read by query().
db = None
|
|
|
from PyPDF2 import PdfReader |
|
|
|
|
|
def load_pdf(file_path):
    """Read every page of the PDF at *file_path* and return its text.

    Args:
        file_path: Path to a PDF file readable by PyPDF2.

    Returns:
        The extracted text of all pages concatenated into one string.
        Pages for which PyPDF2 cannot extract text (e.g. image-only
        pages, where extract_text() returns None) contribute an empty
        string instead of raising a TypeError.
    """
    print("load pdf")
    reader = PdfReader(file_path)

    # Single join instead of repeated string += (quadratic on large PDFs);
    # `or ''` guards against extract_text() returning None.
    return ''.join(page.extract_text() or '' for page in tqdm(reader.pages))
|
|
|
|
|
def extract_text_with_format(pdf_path):
    """Extract the text of every page of *pdf_path* via pdfplumber.

    NOTE(review): `pdfplumber` is never imported anywhere in this file,
    so calling this function raises NameError. It also appears unused —
    run() uses load_pdf() instead. Either add the import or delete this
    function; confirm with the owner before removing.
    """
    with pdfplumber.open(pdf_path) as pdf:
        text = ''
        for page in tqdm(pdf.pages):
            # NOTE(review): pdfplumber's extract_text() can return None for
            # pages without a text layer, which would make `+=` raise —
            # presumably the target PDFs always have one; TODO confirm.
            text += page.extract_text()
        return text
|
|
|
|
|
from collections import deque |
|
|
|
|
|
def split_text(text, paragraph_length, overlapping_length):
    """Split *text* into overlapping chunks measured in tokens.

    Args:
        text: Raw text to split.
        paragraph_length: Number of new tokens each chunk advances by.
        overlapping_length: Number of tokens of the previous chunk that
            are prepended to every subsequent chunk for context.

    Returns:
        A list of decoded text chunks. The first chunk is up to
        paragraph_length tokens; every later chunk is up to
        paragraph_length + overlapping_length tokens (it re-includes the
        tail of its predecessor).
    """
    # Tokenize with the GPT-4 encoding (cl100k_base). The original code
    # first fetched cl100k_base explicitly and immediately overwrote it
    # with this equivalent call; the dead assignment is removed.
    enc = tiktoken.encoding_for_model("gpt-4")
    tokens = enc.encode(text)

    splitted_text = []
    i = 0
    while i < len(tokens):
        # Start `overlapping_length` tokens early (clamped at 0) so that
        # neighbouring chunks share context across the boundary.
        start = max(i - overlapping_length, 0)
        splitted_text.append(enc.decode(tokens[start:i + paragraph_length]))
        i += paragraph_length

    return splitted_text
|
|
|
|
|
def save_in_DB(splitted_text):
    """Embed the given text chunks and store them in an in-memory Chroma DB.

    Args:
        splitted_text: List of text chunks to embed.

    Returns:
        The populated Chroma vector store.
    """
    # Sentence-transformers MiniLM produces the embeddings for every chunk.
    embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    store = Chroma.from_texts(splitted_text, embedder)
    print("Data saved successfully!")
    print("type db", type(store))
    return store
|
|
|
|
|
def query(query_text, num_docs):
    """Answer *query_text* using the *num_docs* most similar stored chunks.

    Retrieves the top chunks from the module-level Chroma store `db`,
    builds a prompt with them as context, and asks gpt-3.5-turbo.

    Args:
        query_text: The user's question.
        num_docs: How many retrieved chunks to put into the context.

    Returns:
        The model's answer as a string.

    Raises:
        AttributeError: if run() has not populated the global `db` yet
            (db is None at import time).
    """
    st.title('RAG system')

    docs = db.similarity_search(query_text, k=num_docs)
    print("len(docs)", len(docs))

    for doc in docs:
        print("doc", doc.page_content)
        print()
        print()

    # BUG FIX: the context previously used only docs[:5], silently ignoring
    # any num_docs > 5 chosen via the UI slider (which goes up to 20).
    # Use every retrieved chunk so the slider value is honoured.
    context = '\n\n'.join(doc.page_content for doc in docs)

    instruct = f"The following is a context from various documents:\n{context}\n\nQuestion: {query_text}\nAnswer:"

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": instruct}
        ],
        max_tokens=150
    )

    predicted = completion.choices[0].message["content"]

    st.subheader("Answer:")
    st.write(predicted)
    return predicted
|
|
|
|
|
|
|
def run():
    """Index the source PDF into the global vector store and launch the UI."""
    global db

    print("run app")
    raw_text = load_pdf(file_path)

    chunks = split_text(raw_text, paragraph_length, overlapping_length)
    print("num splitted text", len(chunks))
    db = save_in_DB(chunks)
    print("type db", type(db))

    # Build the two input widgets up front, then wire them into the interface.
    question_box = Textbox(lines=1, placeholder="Type your question here...", label="Question")
    doc_count = Slider(minimum=1, maximum=20, default=4, step=1, label="Number of Documents in Context")

    ui = gr.Interface(
        fn=query,
        inputs=[question_box, doc_count],
        outputs="text",
        theme="dark"
    )
    ui.launch()
|
|
|
|
|
run() |