Spaces:

xavierbarbier
/

rag_ngap

Sleeping

xavierbarbier committed on Aug 23, 2024

Commit

6e9cf31

verified ·

1 Parent(s): 49fdcd9

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -39,6 +39,17 @@ model._is_chat_session_activated = False
 max_new_tokens = 2048
 # creating a pdf reader object
 print("Finish the model init process")
@@ -51,6 +62,10 @@ p = pipeline(
     model="impira/layoutlm-document-qa",
 )
 def qa(question: str, doc: str) -> str:
     reader = PdfReader(doc)
@@ -64,8 +79,30 @@ def qa(question: str, doc: str) -> str:
     text = ' '.join(text)
-    return text
 demo = gr.Interface(

 max_new_tokens = 2048
+model_kwargs = {'device': 'cpu'}
+encode_kwargs = {'normalize_embeddings': False}
+embeddings = HuggingFaceEmbeddings(
+    model_kwargs=model_kwargs,
+    encode_kwargs=encode_kwargs
+)
+chunk_size = 2048
 # creating a pdf reader object
 print("Finish the model init process")
     model="impira/layoutlm-document-qa",
 )
+def get_text_embedding(text):
+    return embeddings.embed_query(text)
 def qa(question: str, doc: str) -> str:
     reader = PdfReader(doc)
     text = ' '.join(text)
+    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
+    text_embeddings = np.array([get_text_embedding(chunk) for chunk in chunks])
+    d = text_embeddings.shape[1]
+    index = faiss.IndexFlatL2(d)
+    index.add(text_embeddings)
+    question_embeddings = np.array([get_text_embedding(question)])
+    D, I = index.search(question_embeddings, k=2) # distance, index
+    retrieved_chunk = [chunks[i] for i in I.tolist()[0]]
+    prompt = f"""
+            Context information is below.
+            ---------------------
+            {retrieved_chunk}
+            ---------------------
+            Given the context information and not prior knowledge, answer the query.
+            Query: {question}
+            Answer:
+                """
+    return prompt
 demo = gr.Interface(