Spaces:

vr18
/

legal-rag

Runtime error

App Files Files Community

vr18 committed on Oct 11, 2023

Commit

853a403

1 Parent(s): 93d6074

Upload app.py

Browse files

Files changed (1) hide show

app.py +124 -0

app.py ADDED Viewed

	@@ -0,0 +1,124 @@

import os

import gradio as gr
import openai
import streamlit as st
import tiktoken
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from PyPDF2 import PdfReader
# import pdfplumber
from tqdm import tqdm
# SECURITY: the OpenAI API key is read from the environment instead of being
# hard-coded in source. The literal key that used to be committed on this line
# is exposed in the repository history and must be revoked and rotated.
# If OPENAI_API_KEY is unset this assigns None.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Ingestion settings consumed by run().
file_path = "data/Hair-Relaxer-Master-Complaint-1.pdf"  # PDF that gets indexed
paragraph_length = 100  # token stride between consecutive chunks (see split_text)
overlapping_length = 50  # tokens repeated from the end of the previous chunk

# Vector store shared across functions: run() assigns it, query() reads it.
db = None
+from PyPDF2 import PdfReader
def load_pdf(file_path):
    """Read a PDF with PyPDF2 and return the text of all its pages.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The extracted text of every page, concatenated in page order with no
        separator between pages.
    """
    print("load pdf")
    reader = PdfReader(file_path)
    # Join once instead of growing a string with `+=` per page. The `or ''`
    # is defensive: if extract_text() ever yields None for a page, string
    # concatenation would otherwise raise TypeError.
    return ''.join(page.extract_text() or '' for page in tqdm(reader.pages))
def extract_text_with_format(pdf_path):
    """Read a PDF with pdfplumber and return the text of all its pages.

    Alternative to load_pdf() that uses pdfplumber instead of PyPDF2.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The extracted text of every page, concatenated in page order with no
        separator between pages.

    Raises:
        ImportError: If pdfplumber is not installed.
    """
    # The module-level `import pdfplumber` is commented out, so the name was
    # undefined here and every call raised NameError. Import it lazily so the
    # package is only required when this function is actually used.
    import pdfplumber

    with pdfplumber.open(pdf_path) as pdf:
        # `or ''` is defensive: a page with no extractable text may yield
        # None (TODO confirm for the installed pdfplumber version), which
        # would make concatenation raise TypeError.
        return ''.join(page.extract_text() or '' for page in tqdm(pdf.pages))
+from collections import deque
def split_text(text, paragraph_length, overlapping_length):
    """Split *text* into overlapping chunks measured in tokens.

    The text is tokenised with the tiktoken encoding for "gpt-4". Windows
    start at token offsets 0, paragraph_length, 2 * paragraph_length, ...;
    each chunk is its window's ``paragraph_length`` tokens preceded by up to
    ``overlapping_length`` tokens from just before the window. Chunks after
    the first can therefore hold up to
    ``paragraph_length + overlapping_length`` tokens.

    Args:
        text: The text to split.
        paragraph_length: Token stride between consecutive windows; must be
            a positive integer.
        overlapping_length: Number of preceding tokens prepended to each
            chunk; must not be negative.

    Returns:
        A list of decoded chunk strings, empty when *text* has no tokens.

    Raises:
        ValueError: If ``paragraph_length`` is not positive (the original
            loop would never advance) or ``overlapping_length`` is negative
            (tokens would be silently skipped).
    """
    # Validate before touching the tokenizer so bad arguments fail fast.
    if paragraph_length <= 0:
        raise ValueError("paragraph_length must be a positive number of tokens")
    if overlapping_length < 0:
        raise ValueError("overlapping_length must not be negative")

    enc = tiktoken.encoding_for_model("gpt-4")
    tokens = enc.encode(text)

    splitted_text = []
    for window_start in range(0, len(tokens), paragraph_length):
        # Reach back into the previous window for the overlap, clamped at 0.
        start = max(window_start - overlapping_length, 0)
        end = window_start + paragraph_length
        splitted_text.append(enc.decode(tokens[start:end]))
    return splitted_text
def save_in_DB(splitted_text):
    """Embed the text chunks and index them in a Chroma vector store.

    Args:
        splitted_text: The chunk strings to embed and store.

    Returns:
        The store object returned by ``Chroma.from_texts``.
    """
    # Sentence-transformers model used to embed every chunk.
    embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    store = Chroma.from_texts(splitted_text, embedder)
    print("Data saved successfully!")
    print("type db", type(store))
    return store
def query(query_text):
    """Answer a question using retrieved document chunks as context.

    The question is matched against the module-level vector store ``db``.
    The text of at most five returned documents is placed in a prompt and
    sent to the OpenAI chat completion endpoint.

    Args:
        query_text: The user's question.

    Returns:
        A ``(predicted, context)`` tuple: the model's answer and the context
        text that was included in the prompt.

    Raises:
        RuntimeError: If the vector store has not been built yet (``db`` is
            still ``None``).
    """
    # Fail with a clear message instead of an AttributeError on None.
    if db is None:
        raise RuntimeError(
            "Vector store is not initialised; call run() before query()."
        )

    # NOTE(review): Streamlit calls inside a function that run() registers as
    # a Gradio callback - presumably left over from an earlier Streamlit UI.
    # Kept to preserve existing behavior; confirm and remove.
    st.title('RAG system')

    docs = db.similarity_search(query_text)
    print("len(docs)", len(docs))

    # Use at most the first five retrieved chunks as context.
    context = '\n\n'.join(doc.page_content for doc in docs[:5])
    instruct = f"The following is a context from various documents:\n{context}\n\nQuestion: {query_text}\nAnswer:"

    # NOTE(review): openai.ChatCompletion looks like the pre-1.0 interface of
    # the openai package - confirm the installed version still provides it.
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # or any other model you're targeting
        messages=[
            {"role": "user", "content": instruct}
        ],
        max_tokens=150,
    )
    predicted = completion.choices[0].message["content"]

    st.subheader("Answer:")
    st.write(predicted)
    return predicted, context
def run():
    """Build the vector store from the configured PDF and launch the Gradio UI.

    Reads the module-level ``file_path``, ``paragraph_length`` and
    ``overlapping_length`` settings, and stores the resulting index in the
    module-level ``db`` so that query() can search it.
    """
    global db
    print("run app")
    chunks = split_text(load_pdf(file_path), paragraph_length, overlapping_length)
    print("num splitted text", len(chunks))
    db = save_in_DB(chunks)
    print("type db", type(db))
    # One text input (the question) and two text outputs (answer, context),
    # matching the tuple returned by query().
    interface = gr.Interface(fn=query, inputs="text", outputs=["text", "text"])
    interface.launch()
# Start the app only when this file is executed as a script, so that merely
# importing the module does not load the PDF, build the index and launch the
# web server.
if __name__ == "__main__":
    run()