File size: 3,507 Bytes
853a403
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os

import gradio as gr
import openai
import streamlit as st
import tiktoken
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from PyPDF2 import PdfReader
# import pdfplumber
from tqdm import tqdm

# SECURITY: the API key must never live in source control. It is read from the
# OPENAI_API_KEY environment variable instead; any key that was ever hard-coded
# in this file should be treated as leaked and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Pipeline configuration: source PDF, chunk size and chunk overlap. The two
# lengths are passed to split_text(), which applies them to token positions.
file_path = "data/Hair-Relaxer-Master-Complaint-1.pdf"
paragraph_length = 100
overlapping_length = 50
# Vector store handle; stays None until run() assigns it.
db = None

from PyPDF2 import PdfReader


def load_pdf(file_path):
    """Read the PDF at ``file_path`` and return the text of all its pages.

    Page texts are concatenated in page order with no separator. Progress is
    reported through tqdm while the pages are processed.
    """
    print("load pdf")
    pdf = PdfReader(file_path)
    return ''.join(page.extract_text() for page in tqdm(pdf.pages))


def extract_text_with_format(pdf_path):
    """Extract the text of every page of a PDF using pdfplumber.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The text of all pages concatenated in page order, with no separator.

    Raises:
        ImportError: If pdfplumber is not installed.
    """
    # Imported at function scope because the module-level pdfplumber import is
    # commented out; without this, every call failed with NameError.
    import pdfplumber

    with pdfplumber.open(pdf_path) as pdf:
        # Guard against a page yielding None (no extractable text) so one
        # empty page cannot abort the whole extraction.
        return ''.join(page.extract_text() or '' for page in tqdm(pdf.pages))


from collections import deque


def split_text(text, paragraph_length, overlapping_length):
    """Split ``text`` into overlapping chunks measured in tokens.

    The text is tokenized with the tiktoken encoding for the "gpt-4" model.
    Chunk boundaries advance by ``paragraph_length`` tokens. Every chunk
    except the first is additionally extended backwards by
    ``overlapping_length`` tokens, so a chunk holds at most
    ``paragraph_length + overlapping_length`` tokens.

    Args:
        text: The text to split.
        paragraph_length: Number of new tokens each chunk contributes; must be
            positive.
        overlapping_length: Number of tokens repeated from the end of the
            previous chunk; must not be negative.

    Returns:
        A list of decoded chunk strings, in document order. Empty when
        ``text`` encodes to no tokens.

    Raises:
        ValueError: If ``paragraph_length`` is not positive or
            ``overlapping_length`` is negative.
    """
    # A non-positive step would never advance through the tokens (endless
    # loop), and a negative overlap would silently skip tokens between chunks.
    if paragraph_length <= 0:
        raise ValueError("paragraph_length must be a positive number of tokens")
    if overlapping_length < 0:
        raise ValueError("overlapping_length must not be negative")

    enc = tiktoken.encoding_for_model("gpt-4")
    tokens = enc.encode(text)

    return [
        enc.decode(tokens[max(start - overlapping_length, 0):start + paragraph_length])
        for start in range(0, len(tokens), paragraph_length)
    ]


def save_in_DB(splitted_text):
    """Embed the text chunks and load them into a Chroma vector store.

    Args:
        splitted_text: The chunk strings to store.

    Returns:
        The Chroma store created from ``splitted_text``.
    """
    # Open-source sentence-transformer model used as the embedding function.
    embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    store = Chroma.from_texts(splitted_text, embedder)

    print("Data saved successfully!")
    print("type db", type(store))
    return store


def query(query_text):
    """Answer ``query_text`` with the chat model, using retrieved chunks as context.

    The module-level ``db`` is searched for chunks similar to the question,
    the results are joined into a context string, and that context plus the
    question are sent to the OpenAI chat completion endpoint.

    Args:
        query_text: The user's question.

    Returns:
        A ``(predicted, context)`` tuple: the model's answer and the context
        string that was included in the prompt.

    Raises:
        RuntimeError: If the module-level ``db`` has not been built yet.
    """
    # Fail with an actionable message instead of an AttributeError on None.
    if db is None:
        raise RuntimeError("Vector store is not initialised; call run() before query().")

    # NOTE(review): run() registers this function as a Gradio callback, yet it
    # also emits Streamlit output - confirm the st.* calls are still wanted.
    st.title('RAG system')

    # similarity_search is called with its default result count.
    docs = db.similarity_search(query_text)
    print("len(docs)", len(docs))

    # Use at most the first 5 results as context.
    context = '\n\n'.join(doc.page_content for doc in docs[:5])
    instruct = f"The following is a context from various documents:\n{context}\n\nQuestion: {query_text}\nAnswer:"

    # Make an OpenAI request with the given context and query
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # or any other model you're targeting
        messages=[
            {"role": "user", "content": instruct}
            ],
        max_tokens=150
        )

    # Extract the generated answer
    predicted = completion.choices[0].message["content"]

    st.subheader("Answer:")
    st.write(predicted)
    return predicted, context



def run():
    """Build the vector store from the configured PDF and launch the Gradio UI.

    Loads ``file_path``, splits its text with the module-level chunk settings,
    stores the chunks via ``save_in_DB`` in the global ``db``, then serves
    ``query`` through a Gradio interface.
    """
    global db
    print("run app")

    chunks = split_text(load_pdf(file_path), paragraph_length, overlapping_length)
    print("num splitted text", len(chunks))

    db = save_in_DB(chunks)
    print("type db", type(db))

    # One text input; two text outputs for the (answer, context) pair that
    # query() returns.
    interface = gr.Interface(fn=query, inputs="text", outputs=["text", "text"])
    interface.launch()

# Runs at module level: building the index and launching the UI happen as soon
# as this file is executed or imported.
run()