akshansh36 committed
Commit 9bd3750 · verified · 1 Parent(s): 7d4344b

Update tools.py

Files changed (1)
  1. tools.py +73 -73
tools.py CHANGED
@@ -1,73 +1,73 @@
- from langchain_core.tools import tool
- import pinecone
- from langchain_google_genai import GoogleGenerativeAIEmbeddings
- import os
- from dotenv import load_dotenv
-
- load_dotenv()
- GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
- PINECONE_API = os.getenv("PINECONE_API_KEY")
-
- google_embeddings = GoogleGenerativeAIEmbeddings(
-     model="models/embedding-001",  # Correct model name
-     google_api_key=GOOGLE_API_KEY
- )
-
- pc = pinecone.Pinecone(
-     api_key=PINECONE_API
- )
-
- PINECONE_INDEX = "rites-pdf"
- index = pc.Index(PINECONE_INDEX)
-
- @tool
- def get_context(query: str) -> str:
-     """
-     Retrieve context information by performing a semantic search on indexed document chunks.
-
-     This tool embeds the provided user query using a Google Generative AI embeddings model,
-     then queries a Pinecone index to fetch the top 10 matching document chunks. Each match
-     includes metadata such as the text chunk, starting page, ending page, and the source PDF URL.
-     The function aggregates these details into a formatted string.
-
-     Args:
-         query (str): A user query search string used for semantic matching against the document index.
-
-     Returns:
-         str: A formatted string containing the matched document chunks along with their associated metadata,
-         including start page, end page, and PDF URL.
-     """
-     embedding = google_embeddings.embed_query(query)
-     search_results = index.query(
-         vector=embedding,
-         top_k=10,  # Retrieve top 10 results
-         include_metadata=True
-     )
-     context = " "
-     count = 1
-     for match in search_results["matches"]:
-         chunk = match["metadata"].get("chunk")
-         url = match["metadata"].get("pdf_url")
-         start_page = match["metadata"].get("start_page")
-         end_page = match["metadata"].get("end_page")
-
-         context += f"""
-         Chunk {count}:
-         {chunk}
-         start_page: {start_page}
-         end_page: {end_page}
-         pdf_url: {url}
-         #########################################
-         """
-         count += 1
-
-     return context
-
-
-
-
-
-
-
-
-
 
+ from langchain_core.tools import tool
+ import pinecone
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+ GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
+ PINECONE_API = os.getenv("PINECONE_API_KEY")
+
+ google_embeddings = GoogleGenerativeAIEmbeddings(
+     model="models/embedding-001",  # Correct model name
+     google_api_key=GOOGLE_API_KEY
+ )
+
+ pc = pinecone.Pinecone(
+     api_key=PINECONE_API
+ )
+
+ PINECONE_INDEX = "rites-pdf"
+ index = pc.Index(PINECONE_INDEX)
+
+ @tool
+ def get_context(query: str) -> str:
+     """
+     Retrieve context information by performing a semantic search on indexed document chunks.
+
+     This tool embeds the provided user query using a Google Generative AI embeddings model,
+     then queries a Pinecone index to fetch the top 20 matching document chunks. Each match
+     includes metadata such as the text chunk, starting page, ending page, and the source PDF URL.
+     The function aggregates these details into a formatted string.
+
+     Args:
+         query (str): A user query search string used for semantic matching against the document index.
+
+     Returns:
+         str: A formatted string containing the matched document chunks along with their associated metadata,
+         including start page, end page, and PDF URL.
+     """
+     embedding = google_embeddings.embed_query(query)
+     search_results = index.query(
+         vector=embedding,
+         top_k=20,  # Retrieve top 20 results
+         include_metadata=True
+     )
+     context = " "
+     count = 1
+     for match in search_results["matches"]:
+         chunk = match["metadata"].get("chunk")
+         url = match["metadata"].get("pdf_url")
+         start_page = match["metadata"].get("start_page")
+         end_page = match["metadata"].get("end_page")
+
+         context += f"""
+         Chunk {count}:
+         {chunk}
+         start_page: {start_page}
+         end_page: {end_page}
+         pdf_url: {url}
+         #########################################
+         """
+         count += 1
+
+     return context
+
+
+
+
+
+
+
+
+
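
For context, a minimal sketch of how the updated get_context tool could be exercised once this commit is applied. This is not part of the commit: the script name, the example query string, and the assumption that tools.py is importable with GEMINI_API_KEY and PINECONE_API_KEY set in .env are illustrative only; the call uses the standard LangChain tool .invoke interface.

# hypothetical smoke test for the get_context tool (not part of this commit)
# assumes tools.py is on the import path and the required API keys are configured
from tools import get_context

if __name__ == "__main__":
    # tools created with @tool are Runnables; pass arguments as a dict to .invoke
    result = get_context.invoke({"query": "example search query"})
    # prints the formatted chunks with start_page, end_page and pdf_url metadata
    print(result)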