Spaces: Build error

mphycx committed d3c3946 (parent: 3fcca8e): Push large file

Files changed:
- .gitattributes +1 -0
- Dockerfile +28 -0
- LICENSE +21 -0
- api.py +336 -0
- app.py +409 -0
- docker-compose.yaml +15 -0
- intfloat/e5-small-v2/config.json +25 -0
- intfloat/e5-small-v2/pytorch_model.bin +3 -0
- intfloat/e5-small-v2/special_tokens_map.json +7 -0
- intfloat/e5-small-v2/tokenizer.json +0 -0
- intfloat/e5-small-v2/tokenizer_config.json +15 -0
- intfloat/e5-small-v2/vocab.txt +0 -0
- intfloat/multilingual-e5-base/README.md +0 -0
- intfloat/multilingual-e5-base/config.json +28 -0
- intfloat/multilingual-e5-base/gitattributes +34 -0
- intfloat/multilingual-e5-base/pytorch_model.bin +3 -0
- intfloat/multilingual-e5-base/sentencepiece.bpe.model +3 -0
- intfloat/multilingual-e5-base/special_tokens_map.json +15 -0
- intfloat/multilingual-e5-base/tokenizer_config.json +19 -0
- requirements_api.txt +7 -0
- requirements_app.txt +4 -0
- requirements_pytorch.txt +4 -0
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*tokenizer.json filter=lfs diff=lfs merge=lfs -text
Dockerfile
ADDED
FROM python:3.9-slim-bullseye as langchain-serve-img

COPY requirements_pytorch.txt requirements_pytorch.txt
COPY requirements_api.txt requirements_api.txt
RUN pip3 install -r requirements_pytorch.txt
RUN pip3 install -r requirements_api.txt

COPY api.py api.py

EXPOSE 8080

ENTRYPOINT [ "lc-serve", "deploy", "local", "api.py" ]

FROM python:3.9-slim-bullseye as pdfgpt-chat-img

COPY requirements_app.txt requirements_app.txt
RUN pip3 install -r requirements_app.txt

# curl is not included in slim images; install it so the HEALTHCHECK below can run
RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf /var/lib/apt/lists/*

WORKDIR /app

COPY intfloat /app/intfloat
COPY app.py app.py

EXPOSE 7860

HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health

ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=7860"]
LICENSE
ADDED
MIT License

Copyright (c) 2023 mphycx

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
api.py
ADDED
import gc
import os
import re
import shutil
import urllib.request
from pathlib import Path
from tempfile import NamedTemporaryFile

import fitz
import numpy as np
import openai
import torch
import torch.nn.functional as F
from fastapi import UploadFile
from lcserve import serving
from optimum.bettertransformer import BetterTransformer
from sklearn import svm
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from torch import Tensor
from transformers import AutoModel, AutoTokenizer

recommender = None


def download_pdf(url, output_path):
    urllib.request.urlretrieve(url, output_path)


def preprocess(text):
    text = text.replace("-\n", "")
    text = text.replace("\n", " ")
    text = re.sub(r"\s+", " ", text)
    return text


def get_margin(pdf):
    # crop 5% from every edge to drop headers, footers and page numbers
    page = pdf[0]
    page_size = page.mediabox
    margin_hor = page.mediabox.width * 0.05
    margin_ver = page.mediabox.height * 0.05
    margin_size = page_size + (margin_hor, margin_ver, -margin_hor, -margin_ver)
    return margin_size


def pdf_to_text(path, start_page=1, end_page=None):
    doc = fitz.open(path)
    total_pages = doc.page_count

    if end_page is None:
        end_page = total_pages

    text_list = []
    margin_size = get_margin(doc)
    for i in range(start_page - 1, end_page):
        page = doc[i]
        page.set_cropbox(margin_size)
        text = page.get_text("text")
        text = preprocess(text)
        text_list.append(text)

    doc.close()
    return text_list


def text_to_chunks(texts, word_length=150, start_page=1):
    text_toks = [t.split(" ") for t in texts]
    chunks = []

    for idx, words in enumerate(text_toks):
        for i in range(0, len(words), word_length):
            chunk = words[i : i + word_length]
            # carry a trailing short chunk over to the next page
            if (
                (i + word_length) > len(words)
                and (len(chunk) < word_length)
                and (len(text_toks) != (idx + 1))
            ):
                text_toks[idx + 1] = chunk + text_toks[idx + 1]
                continue
            chunk = " ".join(chunk).strip()
            chunk = f"[Page no. {idx + start_page}]" + " " + '"' + chunk + '"'
            chunks.append(chunk)
    return chunks


class SemanticSearch:
    def __init__(self, embedding_model):
        self.tokenizer = AutoTokenizer.from_pretrained(f"intfloat/{embedding_model}")
        self.model = AutoModel.from_pretrained(f"intfloat/{embedding_model}")
        self.model = BetterTransformer.transform(self.model, keep_original_model=True)

        # set device
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = self.model.to(self.device)
        self.fitted = False

    def fit(self, data, batch_size=32, n_neighbors=5):
        self.data = data
        self.embeddings = self.get_text_embedding(self.data, batch_size=batch_size)
        self.fitted = True

    def __call__(self, text, return_data=True):
        self.inp_emb = self.get_text_embedding([text], prefix="query")
        self.matches = self.run_svm(self.inp_emb, self.embeddings)

        if return_data:
            # return the first 5 matches; index 0 is the query itself, so skip it
            return [self.data[i - 1] for i in self.matches[1:6]]
        else:
            return self.matches

    def average_pool(
        self, last_hidden_states: Tensor, attention_mask: Tensor
    ) -> Tensor:
        self.last_hidden = last_hidden_states.masked_fill(
            ~attention_mask[..., None].bool(), 0.0
        )
        return self.last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

    def get_text_embedding(self, texts, prefix="passage", batch_size=32):
        # Tokenize the input texts; E5 models expect a "query: " or "passage: " prefix
        texts = [f"{prefix}: {text}" for text in texts]
        batch_dict = self.tokenizer(
            texts, max_length=512, padding=True, truncation=True, return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**batch_dict)

        embeddings = self.average_pool(
            outputs.last_hidden_state, batch_dict["attention_mask"]
        )

        # Normalize embeddings
        embeddings = F.normalize(embeddings, p=2, dim=1)

        # Convert pytorch tensor to numpy array (no grad)
        if self.device == "cuda":
            embeddings = embeddings.detach().cpu().clone().numpy()
        else:
            embeddings = embeddings.detach().numpy()
        return embeddings

    def run_svm(self, query_emb, passage_emb):
        joined_emb = np.concatenate((query_emb, passage_emb))

        # SVM labels: mark the query as the only positive example
        y = np.zeros(joined_emb.shape[0])
        y[0] = 1

        # train an Exemplar SVM
        clf = svm.LinearSVC(
            class_weight="balanced", verbose=False, max_iter=10000, tol=1e-6, C=0.1
        )
        clf.fit(joined_emb, y)

        # rank all embeddings by decision-function similarity to the query
        similarities = clf.decision_function(joined_emb)
        sorted_ix = np.argsort(-similarities)
        return sorted_ix

    def summarize(self):
        n_clusters = int(np.ceil(len(self.embeddings) ** 0.5))
        # cap at 5 clusters to reserve tokens
        n_clusters = min(n_clusters, 5)
        kmeans = KMeans(n_clusters=n_clusters, random_state=23)
        kmeans = kmeans.fit(self.embeddings)

        # order clusters by the average position of their chunks
        avg = []
        for j in range(n_clusters):
            idx = np.where(kmeans.labels_ == j)[0]
            avg.append(np.mean(idx))
        # find the chunk closest to each centroid
        closest, _ = pairwise_distances_argmin_min(
            kmeans.cluster_centers_, self.embeddings
        )
        ordering = sorted(range(n_clusters), key=lambda k: avg[k])
        # concatenate the representative chunks in document order
        summary = [self.data[i] for i in [closest[idx] for idx in ordering]]
        return summary


def clear_cache():
    global recommender
    if "recommender" in globals():
        del recommender
    gc.collect()
    if torch.cuda.is_available():
        return torch.cuda.empty_cache()


def load_recommender(path, embedding_model, rebuild_embedding, start_page=1):
    global recommender
    if rebuild_embedding:
        clear_cache()
        recommender = None
    if recommender is None:
        recommender = SemanticSearch(embedding_model)
    if recommender.fitted:
        return "Corpus Loaded."
    else:
        texts = pdf_to_text(path, start_page=start_page)
        chunks = text_to_chunks(texts, start_page=start_page)
        recommender.fit(chunks)
        return "Corpus Loaded."


def generate_text(openai_key, prompt, model="gpt-3.5-turbo"):
    openai.api_key = openai_key
    completions = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=512,
        n=1,
        stop=None,
        temperature=0.7,
    )
    # responses are "###"-delimited: prompt###answer###tokens_used###model
    message = f"{prompt}###{completions.choices[0].message.content}###{completions.usage.total_tokens}###{completions.model}"
    return message


def generate_answer(question, gpt_model, openai_key):
    topn_chunks = recommender(question)
    prompt = ""
    prompt += "search results:\n\n"
    for c in topn_chunks:
        prompt += c + "\n\n"

    prompt += (
        "Instructions: Compose a comprehensive reply to the query using the search results given. "
        "Cite each reference using [Page Number] notation (every result has this number at the beginning). "
        "Citation should be done at the end of each sentence. If the search results mention multiple subjects "
        "with the same name, create separate answers for each. Only include information found in the results and "
        "don't add any additional information. Make sure the answer is correct and don't output false content. "
        "If the text does not relate to the query, simply state 'Text Not Found in PDF'. Ignore outlier "
        "search results which have nothing to do with the question. Only answer what is asked. The "
        "answer should be short and concise. Answer step-by-step.\n\n"
    )

    prompt += f"Query: {question}"
    answer = generate_text(openai_key, prompt, gpt_model)
    return answer


def generate_summary(gpt_model, openai_key):
    topn_chunks = recommender.summarize()
    prompt = ""
    prompt += (
        "Summarize the highlights of the search results and output a summary in bullet points. "
        "Do not write anything before the bullet points. "
        "Cite each reference using [Page no.] notation (every result has this number at the beginning). "
        "Citation should be done at the end of each sentence. "
        "Give a conclusion in the end. "
        "Write the summary in the same language as the search results. "
        "Search results:\n\n"
    )
    for c in topn_chunks:
        prompt += c + "\n\n"
    summary = generate_text(openai_key, prompt, gpt_model)
    return summary


def load_openai_key() -> str:
    key = os.environ.get("OPENAI_API_KEY")
    if key is None:
        raise ValueError(
            "[ERROR]: Please pass your OPENAI_API_KEY. Get your key here: https://platform.openai.com/account/api-keys"
        )
    return key


# %%
@serving
def ask_url(
    url: str,
    question: str,
    rebuild_embedding: bool,
    embedding_model: str,
    gpt_model: str,
) -> str:
    if rebuild_embedding:
        # load_url expects gpt_model as well
        load_url(url, embedding_model, rebuild_embedding, gpt_model)
    openai_key = load_openai_key()
    return generate_answer(question, gpt_model, openai_key)


@serving
async def ask_file(
    file: UploadFile,
    question: str,
    rebuild_embedding: bool,
    embedding_model: str,
    gpt_model: str,
) -> str:
    if rebuild_embedding:
        # load_file is a coroutine and expects gpt_model, so it must be awaited
        await load_file(file, embedding_model, rebuild_embedding, gpt_model)
    openai_key = load_openai_key()
    return generate_answer(question, gpt_model, openai_key)


@serving
def load_url(
    url: str,
    embedding_model: str,
    rebuild_embedding: bool,
    gpt_model: str,
) -> str:
    download_pdf(url, "corpus.pdf")
    notification = load_recommender("corpus.pdf", embedding_model, rebuild_embedding)
    openai_key = load_openai_key()
    summary = generate_summary(gpt_model, openai_key)
    response = f"{notification}###{summary}"
    return response


@serving
async def load_file(
    file: UploadFile,
    embedding_model: str,
    rebuild_embedding: bool,
    gpt_model: str,
) -> str:
    suffix = Path(file.filename).suffix
    with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        shutil.copyfileobj(file.file, tmp)
        tmp_path = Path(tmp.name)
    notification = load_recommender(str(tmp_path), embedding_model, rebuild_embedding)
    openai_key = load_openai_key()
    summary = generate_summary(gpt_model, openai_key)
    response = f"{notification}###{summary}"
    return response
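For reference: lc-serve exposes each @serving function above as an HTTP endpoint under its own name, which is how app.py talks to this file. A minimal client sketch follows; the host matches app.py's LCSERVE_HOST, while the sample PDF URL, question, and key are illustrative assumptions:

import requests

LCSERVE_HOST = "http://localhost:8080"  # assumed local deployment, as in app.py

payload = {
    "url": "https://example.com/paper.pdf",  # hypothetical PDF URL
    "question": "What is the main finding?",
    "rebuild_embedding": True,
    "embedding_model": "multilingual-e5-base",
    "gpt_model": "gpt-3.5-turbo",
    "envs": {"OPENAI_API_KEY": "sk-..."},  # lc-serve forwards this into the endpoint's environment
}
r = requests.post(f"{LCSERVE_HOST}/ask_url", json=payload)

# responses are "###"-delimited: prompt###answer###tokens_used###model
prompt, answer, tokens_used, model = r.json()["result"].split("###")
print(answer)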
app.py
ADDED
# %%
import os
import json
import urllib.parse
from tempfile import _TemporaryFileWrapper

import pandas as pd
import requests
import streamlit as st
from streamlit_chat import message
from streamlit_extras.add_vertical_space import add_vertical_space
from streamlit_extras.colored_header import colored_header

st.set_page_config(
    layout="wide",
    page_title="pdfGPT-chat. Ask your PDF!",
    page_icon=":robot_face:",
)


def main():
    @st.cache_data
    def convert_df(df):
        return df.to_csv(index=False).encode("utf-8")

    def pdf_change():
        st.session_state["pdf_change"] = True

    def check_api(api_key):
        # currently unused; the OpenAI key is read from the environment
        return api_key.startswith("sk-") and len(api_key) == 51

    def check_url(url):
        parsed_url = urllib.parse.urlparse(url)
        return all([parsed_url.scheme, parsed_url.netloc])

    def result_to_dict(r, start):
        result = r.json()["result"]
        result = result.split("###")[start:]
        keys = ["prompt", "answer", "token_used", "gpt_model"]
        # An error from the OpenAI server also gives status_code 200; such a
        # result has no "###" separators, so pad the missing fields
        if len(result) == 1:
            result.extend([result[0], 0, gpt_model])
        return dict(zip(keys, result))

    def load_pdf():
        if file is None and len(pdf_url) == 0:
            return st.error("Both URL and PDF are empty. Provide at least one.")
        elif len(pdf_url) > 0:
            if not check_url(pdf_url):
                return st.error("Please enter a valid URL.")
            elif file is not None:
                return st.error(
                    "Both URL and PDF are provided. Please provide only one (either URL or PDF)."
                )
            # load pdf from url
            else:
                r = requests.post(
                    f"{LCSERVE_HOST}/load_url",
                    json={
                        "url": pdf_url,
                        "rebuild_embedding": st.session_state["pdf_change"],
                        "embedding_model": embedding_model,
                        "gpt_model": gpt_model,
                        "envs": {
                            "OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY"),
                        },
                    },
                )
        # load file
        else:
            _data = {
                "rebuild_embedding": st.session_state["pdf_change"],
                "embedding_model": embedding_model,
                "gpt_model": gpt_model,
                "envs": {
                    "OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY"),
                },
            }

            r = requests.post(
                f"{LCSERVE_HOST}/load_file",
                params={"input_data": json.dumps(_data)},
                files={"file": file},
            )
        if r.status_code != 200:
            if "error" in r.json():
                if "message" in r.json()["error"]:
                    return st.error(r.json()["error"]["message"])
            else:
                return str(r.json())
        elif r.json()["result"].startswith("Corpus Loaded."):
            st.session_state["loaded"] = True
            st.session_state["pdf_change"] = False
            # extract result
            result = result_to_dict(r, 1)

            # concatenate reply
            reply_summary = (
                "Hello there. I'm **pdfGPT-chat**.\n"
                "Here is a **summary** of your PDF:\n\n"
            )
            reply_summary += result["answer"]
            reply_summary += "\n\nDo you have any **question** about your PDF?"

            if len(st.session_state["past"]) == 1:
                st.session_state["generated"][0] = reply_summary
            else:
                st.session_state["past"].append("Hi")
                st.session_state["generated"].append(reply_summary)

            # calculate cost
            calculate_cost(result["token_used"], result["gpt_model"])
            return st.success("The PDF file has been loaded.")
        else:
            return st.info(r.json()["result"])

    def generate_response(
        lcserve_host: str,
        url: str,
        file: _TemporaryFileWrapper,
        question: str,
        openai_key: str,
    ) -> dict:
        if question.strip() == "":
            return "[ERROR]: Question field is empty"

        _data = {
            "question": question,
            "rebuild_embedding": st.session_state["pdf_change"],
            "embedding_model": embedding_model,
            "gpt_model": gpt_model,
            "envs": {
                "OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY"),
            },
        }

        if url.strip() != "":
            r = requests.post(
                f"{LCSERVE_HOST}/ask_url",
                json={"url": url, **_data},
            )
        else:
            r = requests.post(
                f"{LCSERVE_HOST}/ask_file",
                params={"input_data": json.dumps(_data)},
                files={"file": file},
            )

        if r.status_code != 200:
            content = r.content.decode()  # convert bytes to string
            # log the raw error without shadowing the uploaded `file`
            with open("langchainlog.txt", "w") as log_file:
                log_file.write(content)
            return f"[ERROR]: {r.text}"

        result_dict = result_to_dict(r, 0)
        return result_dict

    def calculate_cost(token_used, gpt_model):
        st.session_state["total_token"] += int(token_used)
        # price per 1K tokens; add only this call's cost so earlier
        # tokens are not billed again on every request
        rate = 0.002 if "gpt-3" in gpt_model else 0.06
        st.session_state["total_cost"] += int(token_used) * rate / 1000

    # %%
    # main page layout
    header = st.container()
    welcome_page = st.container()
    response_container = st.container()
    input_container = st.container()
    cost_container = st.container()
    load_pdf_popup = st.container()

    # sidebar layout
    input_details = st.sidebar.container()
    preferences = st.sidebar.container()
    chat_download = st.sidebar.container()

    # %%
    # instantiate session states
    if "api_key" not in st.session_state:
        st.session_state["api_key"] = False

    if "generated" not in st.session_state:
        st.session_state["generated"] = [
            "Hello there. I'm pdfGPT-chat. Do you have any question about your PDF?"
        ]

    if "loaded" not in st.session_state:
        st.session_state["loaded"] = False

    if "past" not in st.session_state:
        st.session_state["past"] = ["Hi"]

    if "pdf_change" not in st.session_state:
        st.session_state["pdf_change"] = True

    if "total_cost" not in st.session_state:
        st.session_state["total_cost"] = 0

    if "total_token" not in st.session_state:
        st.session_state["total_token"] = 0

    # %%
    # constants
    E5_URL = "https://github.com/microsoft/unilm/tree/master/e5"
    EMBEDDING_CHOICES = {
        "multilingual-e5-base": "Multilingual-E5 (default)",
        "e5-small-v2": "English-E5-small (faster)",
    }
    GPT_CHOICES = {
        "gpt-3.5-turbo": "GPT-3.5-turbo (default)",
        "gpt-4": "GPT-4 (smarter, costlier)",
    }
    LCSERVE_HOST = "http://localhost:8080"
    PDFGPT_URL = "https://github.com/bhaskatripathi/pdfGPT"
    SIGNATURE = """<style>
    .footer {
    position: static;
    left: 0;
    bottom: 0;
    width: 100%;
    background: rgba(0,0,0,0);
    text-align: center;
    }
    </style>

    <div class="footer">
    <p style='display: block;
    text-align: center;
    font-size:14px;
    color:darkgray'>Developed with ❤ by asyafiqe</p>
    </div>
    """

    with header:
        st.title(":page_facing_up: pdfGPT-chat")
        with st.expander(
            "A fork of [pdfGPT](%s) with several improvements. With pdfGPT-chat, you can chat with your PDF files using [**Microsoft E5 Multilingual Text Embeddings**](%s) and **OpenAI**."
            % (PDFGPT_URL, E5_URL)
        ):
            st.markdown(
                "Compared to other tools, pdfGPT-chat provides a **hallucination-free** response, thanks to its superior embeddings and tailored prompt.<br />"
                "The responses generated by pdfGPT-chat include **citations** in square brackets ([]), indicating the **page numbers** where the relevant information is found.<br />"
                "This feature not only enhances the credibility of the responses but also aids in swiftly locating the pertinent information within the PDF file.",
                unsafe_allow_html=True,
            )

        colored_header(
            label="",
            description="",
            color_name="blue-40",
        )

    with preferences:
        colored_header(
            label="",
            description="",
            color_name="blue-40",
        )
        st.write("**Preferences**")
        embedding_model = st.selectbox(
            "Embedding",
            EMBEDDING_CHOICES.keys(),
            help="""[Multilingual-E5](%s) supports 100 languages.
            E5-small is much faster and suitable for a PC without GPU."""
            % E5_URL,
            on_change=pdf_change,
            format_func=lambda x: EMBEDDING_CHOICES[x],
        )
        gpt_model = st.selectbox(
            "GPT Model",
            GPT_CHOICES.keys(),
            help="For GPT-4 you might have to join the waitlist: https://openai.com/waitlist/gpt-4-api",
            format_func=lambda x: GPT_CHOICES[x],
        )

    # %%
    # sidebar
    with input_details:
        st.title("Input details")
        OPENAI_URL = "https://platform.openai.com/account/api-keys"
        openai_key = st.text_input(
            ":key: Enter your OpenAI API key here",
            type="password",
            help="Get your OpenAI API key [here](%s)" % OPENAI_URL,
        )
        colored_header(
            label="",
            description="",
            color_name="blue-40",
        )

        pdf_url = st.text_input(
            ":globe_with_meridians: Enter PDF URL here", on_change=pdf_change
        )

        st.markdown(
            "<h2 style='text-align: center; color: black;'>OR</h2>",
            unsafe_allow_html=True,
        )

        file = st.file_uploader(
            ":page_facing_up: Upload your PDF / research paper / book here",
            type=["pdf"],
            on_change=pdf_change,
        )

        if st.button("Load PDF"):
            st.session_state["loaded"] = True
            with st.spinner("Loading PDF"):
                with load_pdf_popup:
                    load_pdf()

    # %%
    # main tab
    if st.session_state["loaded"]:
        with input_container:
            with st.form(key="input_form", clear_on_submit=True):
                user_input = st.text_area("Question:", key="input", height=100)
                submit_button = st.form_submit_button(label="Send")

            if user_input and submit_button:
                with st.spinner("Processing your question"):
                    response = generate_response(
                        LCSERVE_HOST,
                        pdf_url,
                        file,
                        user_input,
                        os.environ.get("OPENAI_API_KEY"),
                    )
                    st.session_state.past.append(user_input)
                    st.session_state.generated.append(response["answer"])

                    # calculate cost
                    calculate_cost(response["token_used"], response["gpt_model"])

            if not user_input and submit_button:
                st.error("Please write your question.")

        with response_container:
            if st.session_state["generated"]:
                for i in range(len(st.session_state["generated"])):
                    message(
                        st.session_state["past"][i], is_user=True, key=str(i) + "_user"
                    )
                    message(st.session_state["generated"][i], key=str(i))

                cost_container.caption(
                    f"Estimated cost: $ {st.session_state['total_cost']:.4f}"
                )

    else:
        with welcome_page:
            st.write("")
            st.subheader(
                """:arrow_left: To start, please fill in the input details in the sidebar and click **Load PDF**"""
            )
    # %%
    # placed at the end to include the last conversation
    with chat_download:
        chat_history = pd.DataFrame(
            {
                "Question": st.session_state["past"],
                "Answer": st.session_state["generated"],
            }
        )

        csv = convert_df(chat_history)

        st.download_button(
            label="Download chat history",
            data=csv,
            file_name="chat history.csv",
            mime="text/csv",
        )
        add_vertical_space(2)
        st.markdown(SIGNATURE, unsafe_allow_html=True)

    # %%
    # javascript: scroll halfway through the page
    js = f"""
    <script>
        function scroll() {{
            var textAreas = parent.document.querySelectorAll('section.main');
            var halfwayScroll = 0.4 * textAreas[0].scrollHeight; // calculate halfway scroll position

            for (let index = 0; index < textAreas.length; index++) {{
                textAreas[index].scrollTop = halfwayScroll; // set scroll position to halfway
            }}
        }}

        scroll(); // call the scroll function
    </script>
    """
    st.components.v1.html(js)

    # reduce main top padding
    st.markdown(
        "<style>div.block-container{padding-top:1.5em;}</style>",
        unsafe_allow_html=True,
    )
    # reduce sidebar top padding
    st.markdown(
        "<style>.css-ysnqb2.e1g8pov64 {margin-top: -90px;}</style>",
        unsafe_allow_html=True,
    )


if __name__ == "__main__":
    main()
docker-compose.yaml
ADDED
version: '3'

services:
  langchain-serve:
    build:
      context: .
      target: langchain-serve-img
    ports:
      - '8080:8080'
  pdf-gpt:
    build:
      context: .
      # must match the stage name defined in the Dockerfile
      target: pdfgpt-chat-img
    ports:
      - '7860:7860'
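The two services are meant to come up together via `docker compose up`. A small smoke-test sketch, assuming both containers run on localhost with the ports mapped above (the health URL is the same one the Dockerfile HEALTHCHECK uses):

import requests

def smoke_test():
    # Streamlit's built-in health endpoint (also used by the Dockerfile HEALTHCHECK)
    app_ok = requests.get("http://localhost:7860/_stcore/health", timeout=5).ok
    # any HTTP response from port 8080 proves the lc-serve container is listening
    try:
        requests.post("http://localhost:8080/load_url", json={}, timeout=5)
        api_ok = True
    except requests.ConnectionError:
        api_ok = False
    print(f"app: {app_ok}, api: {api_ok}")

if __name__ == "__main__":
    smoke_test()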
intfloat/e5-small-v2/config.json
ADDED
{
  "_name_or_path": "tmp/",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.29.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
intfloat/e5-small-v2/pytorch_model.bin
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:4790fed2919e70bff573d01cd3aede75970f219ab4c0b0aeadd0f4b98084a17d
size 133508397
intfloat/e5-small-v2/special_tokens_map.json
ADDED
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
intfloat/e5-small-v2/tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
intfloat/e5-small-v2/tokenizer_config.json
ADDED
{
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": true,
  "mask_token": "[MASK]",
  "model_max_length": 1000000000000000019884624838656,
  "never_split": null,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}
intfloat/e5-small-v2/vocab.txt
ADDED
The diff for this file is too large to render. See raw diff.
intfloat/multilingual-e5-base/README.md
ADDED
The diff for this file is too large to render. See raw diff.
intfloat/multilingual-e5-base/config.json
ADDED
{
  "_name_or_path": "tmp/",
  "architectures": [
    "XLMRobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.29.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}
intfloat/multilingual-e5-base/gitattributes
ADDED
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
intfloat/multilingual-e5-base/pytorch_model.bin
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:f061cb7641880f52895cbacab7c4ab39b0844e2e6b73794f2798de460d9fa418
size 1112242989
intfloat/multilingual-e5-base/sentencepiece.bpe.model
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
size 5069051
intfloat/multilingual-e5-base/special_tokens_map.json
ADDED
{
  "bos_token": "<s>",
  "cls_token": "<s>",
  "eos_token": "</s>",
  "mask_token": {
    "content": "<mask>",
    "lstrip": true,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "unk_token": "<unk>"
}
intfloat/multilingual-e5-base/tokenizer_config.json
ADDED
{
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": true,
  "cls_token": "<s>",
  "eos_token": "</s>",
  "mask_token": {
    "__type": "AddedToken",
    "content": "<mask>",
    "lstrip": true,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "model_max_length": 512,
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "tokenizer_class": "XLMRobertaTokenizer",
  "unk_token": "<unk>"
}
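These vendored directories are why SemanticSearch can call from_pretrained(f"intfloat/{embedding_model}") without network access: when the process runs from the repo root (WORKDIR /app, which contains the copied intfloat/ directory), transformers resolves the name as a local path instead of downloading from the Hub. A minimal sketch, assuming it is run from that directory:

from transformers import AutoModel, AutoTokenizer

# resolves to the local intfloat/e5-small-v2/ directory committed above
tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-small-v2")
model = AutoModel.from_pretrained("intfloat/e5-small-v2")

# E5 models expect a "query: " or "passage: " prefix, as api.py adds
batch = tokenizer(["query: what is this pdf about?"], return_tensors="pt")
outputs = model(**batch)  # last_hidden_state is average-pooled in api.py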
requirements_api.txt
ADDED
fastapi==0.96.0
langchain_serve==0.0.41
openai==0.27.7
optimum==1.8.6
PyMuPDF==1.22.3
scikit_learn==1.0.2
transformers==4.29.2
requirements_app.txt
ADDED
pandas==2.0.2
streamlit==1.23.1
streamlit_chat==0.0.2.2
streamlit_extras==0.2.7
requirements_pytorch.txt
ADDED
--extra-index-url https://download.pytorch.org/whl/cu117
torch
torchvision
torchaudio