Spaces:
Running
Running
Update dspy_qa.py
Browse files - dspy_qa.py +13 -12
dspy_qa.py
CHANGED
@@ -4,11 +4,12 @@ import dspy
|
|
4 |
from dsp.utils import deduplicate
|
5 |
from dspy.retrieve.faiss_rm import FaissRM
|
6 |
from langchain_community.document_loaders import PyPDFLoader
|
7 |
-
from langchain_community.document_loaders import CSVLoader
|
8 |
-
|
9 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
10 |
|
11 |
-
|
|
|
|
|
|
|
12 |
|
13 |
class GenerateSearchQuery(dspy.Signature):
|
14 |
"""Write a simple search query that will help answer a complex question."""
|
@@ -32,9 +33,9 @@ class DocQA(dspy.Module):
|
|
32 |
def __init__(self, file_path,passages_per_hop=3, max_hops=2):
|
33 |
super().__init__()
|
34 |
self.cache = "cache.json"
|
35 |
-
self.llm = dspy.AzureOpenAI(api_base=
|
36 |
-
api_version=
|
37 |
-
model="GPT-
|
38 |
|
39 |
self.generate_query = [dspy.ChainOfThought(GenerateSearchQuery) for _ in range(max_hops)]
|
40 |
self.retrieve = dspy.Retrieve(k=passages_per_hop)
|
@@ -44,14 +45,14 @@ class DocQA(dspy.Module):
|
|
44 |
self.knowledge_base = self.create_knowledge_base(file_path)
|
45 |
|
46 |
def load_documents(self, file_path):
|
47 |
-
print("file_path", file_path)
|
48 |
-
loader =
|
49 |
documents = loader.load()
|
50 |
return documents
|
51 |
|
52 |
def split_documents(self, documents):
|
53 |
text_splitter = RecursiveCharacterTextSplitter(
|
54 |
-
chunk_size=
|
55 |
chunk_overlap=0,
|
56 |
length_function=len,
|
57 |
is_separator_regex=False,
|
@@ -59,11 +60,11 @@ class DocQA(dspy.Module):
|
|
59 |
|
60 |
docs = text_splitter.split_documents(documents)
|
61 |
document_chunks = [page_content.page_content for page_content in docs]
|
62 |
-
print("input context Ready")
|
63 |
return document_chunks
|
64 |
|
65 |
def create_knowledge_base(self, file_path):
|
66 |
-
print("file_path", file_path)
|
67 |
document = self.load_documents(file_path)
|
68 |
split_documents = self.split_documents(document)
|
69 |
knowledge_base = FaissRM(split_documents)
|
@@ -72,8 +73,8 @@ class DocQA(dspy.Module):
|
|
72 |
def run(self,question):
|
73 |
dspy.settings.configure(lm=self.llm, rm=self.knowledge_base)
|
74 |
|
75 |
-
|
76 |
passages = self.retrieve(question).passages
|
|
|
77 |
context = deduplicate(passages)
|
78 |
|
79 |
pred = self.generate_answer(context=context, question=question)
|
|
|
4 |
from dsp.utils import deduplicate
|
5 |
from dspy.retrieve.faiss_rm import FaissRM
|
6 |
from langchain_community.document_loaders import PyPDFLoader
|
|
|
|
|
7 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
8 |
|
9 |
+
api_base=os.getenv("AZURE_OPENAI_ENDPOINT")
|
10 |
+
api_version=os.getenv("OPENAI_API_VERSION")
|
11 |
+
|
12 |
+
|
13 |
|
14 |
class GenerateSearchQuery(dspy.Signature):
|
15 |
"""Write a simple search query that will help answer a complex question."""
|
|
|
33 |
def __init__(self, file_path,passages_per_hop=3, max_hops=2):
|
34 |
super().__init__()
|
35 |
self.cache = "cache.json"
|
36 |
+
self.llm = dspy.AzureOpenAI(api_base=api_base,
|
37 |
+
api_version=api_version,
|
38 |
+
model="GPT-4o")
|
39 |
|
40 |
self.generate_query = [dspy.ChainOfThought(GenerateSearchQuery) for _ in range(max_hops)]
|
41 |
self.retrieve = dspy.Retrieve(k=passages_per_hop)
|
|
|
45 |
self.knowledge_base = self.create_knowledge_base(file_path)
|
46 |
|
47 |
def load_documents(self, file_path):
|
48 |
+
# print("file_path", file_path)
|
49 |
+
loader = PyPDFLoader(file_path)
|
50 |
documents = loader.load()
|
51 |
return documents
|
52 |
|
53 |
def split_documents(self, documents):
|
54 |
text_splitter = RecursiveCharacterTextSplitter(
|
55 |
+
chunk_size=10000,
|
56 |
chunk_overlap=0,
|
57 |
length_function=len,
|
58 |
is_separator_regex=False,
|
|
|
60 |
|
61 |
docs = text_splitter.split_documents(documents)
|
62 |
document_chunks = [page_content.page_content for page_content in docs]
|
63 |
+
# print("input context Ready")
|
64 |
return document_chunks
|
65 |
|
66 |
def create_knowledge_base(self, file_path):
|
67 |
+
# print("file_path", file_path)
|
68 |
document = self.load_documents(file_path)
|
69 |
split_documents = self.split_documents(document)
|
70 |
knowledge_base = FaissRM(split_documents)
|
|
|
73 |
def run(self,question):
|
74 |
dspy.settings.configure(lm=self.llm, rm=self.knowledge_base)
|
75 |
|
|
|
76 |
passages = self.retrieve(question).passages
|
77 |
+
print("passages", passages)
|
78 |
context = deduplicate(passages)
|
79 |
|
80 |
pred = self.generate_answer(context=context, question=question)
|