import os

from PyPDF2 import PdfReader
import pdfplumber
from tqdm import tqdm
import tiktoken
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
import openai
import gradio as gr

# Never commit a real key; read it from the environment instead.
openai.api_key = os.environ["OPENAI_API_KEY"]
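# For example (shell), before launching the app:
#   export OPENAI_API_KEY="sk-..."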

# Configuration: both chunk lengths are measured in tokens.
file_path = "data/Hair-Relaxer-Master-Complaint-1.pdf"
paragraph_length = 100   # tokens per chunk
overlapping_length = 50  # tokens of overlap between consecutive chunks
db = None                # global Chroma store, initialized in run()


def load_pdf(file_path):
    """Extract plain text from every page of the PDF with PyPDF2."""
    print("load pdf")
    reader = PdfReader(file_path)

    text = ''
    for page in tqdm(reader.pages):
        text += page.extract_text() or ''  # extract_text() can return None
    return text


def extract_text_with_format(pdf_path):
    """Alternative loader: pdfplumber tends to preserve layout better than PyPDF2."""
    with pdfplumber.open(pdf_path) as pdf:
        text = ''
        for page in tqdm(pdf.pages):
            text += page.extract_text() or ''
        return text


def split_text(text, paragraph_length, overlapping_length):
    """Split text into overlapping chunks; lengths are measured in tokens."""
    # gpt-4 (and gpt-3.5-turbo) use the cl100k_base encoding.
    enc = tiktoken.encoding_for_model("gpt-4")

    def tokens_to_text(tokens):
        return enc.decode(tokens)

    splitted_text = []
    tokens = enc.encode(text)

    # Step forward paragraph_length tokens at a time, but start each chunk
    # overlapping_length tokens back so consecutive chunks share context.
    i = 0
    while i < len(tokens):
        start = max(i - overlapping_length, 0)
        end = i + paragraph_length
        splitted_text.append(tokens_to_text(tokens[start:end]))
        i += paragraph_length

    return splitted_text
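# With the defaults above (paragraph_length=100, overlapping_length=50) the
# chunks cover tokens [0:100], [50:200], [150:300], and so on: every chunk
# after the first reaches 50 tokens back into its predecessor.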


def save_in_DB(splitted_text):
    """Embed the chunks and index them in an in-memory Chroma store."""
    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma.from_texts(splitted_text, embedding_function)
    print("Data saved successfully!")
    return db
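# A minimal sketch of querying the store directly (the texts and the question
# are illustrative; similarity_search and its k parameter are the standard
# Chroma API):
#
#   db = save_in_DB(["first chunk", "second chunk"])
#   hits = db.similarity_search("example question", k=2)
#   print([hit.page_content for hit in hits])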


def query(query_text):
    """Gradio handler: retrieve the closest chunks and ask the model to answer."""
    # similarity_search defaults to 4 results, so request the 5 that the
    # context window below actually uses.
    docs = db.similarity_search(query_text, k=5)
    print("len(docs)", len(docs))

    context = '\n\n'.join(doc.page_content for doc in docs[:5])
    instruct = f"The following is a context from various documents:\n{context}\n\nQuestion: {query_text}\nAnswer:"

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": instruct}
        ],
        max_tokens=150
    )

    predicted = completion.choices[0].message["content"]
    return predicted, context
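# Called outside Gradio, the handler returns the (answer, context) pair that
# feeds the interface's two text outputs; the question is illustrative:
#
#   answer, context = query("What claims does the complaint raise?")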


def run():
    global db
    print("run app")
    text = load_pdf(file_path)

    splitted_text = split_text(text, paragraph_length, overlapping_length)
    print("num splitted text", len(splitted_text))
    db = save_in_DB(splitted_text)

    # One text input; two text outputs (the answer and the retrieved context).
    demo = gr.Interface(fn=query, inputs="text", outputs=["text", "text"])
    demo.launch()
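# launch() serves the interface locally; passing share=True (a standard Gradio
# flag) would additionally expose a temporary public URL, left off here on purpose.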


if __name__ == "__main__":
    run()