Transfer of files

- modules/data_processor.py +75 -0
- modules/embedding_storage.py +47 -0
- modules/gsheet_handler.py +43 -0
- modules/qa_chatbot.py +39 -0
- modules/scraper.py +77 -0
modules/data_processor.py
ADDED
@@ -0,0 +1,75 @@
import re
import pandas as pd
from modules.scraper import get_raw_data, get_raw_data_sheets
from modules.embedding_storage import process_safety_with_chroma
from modules.qa_chatbot import create_chatbot, ask_question


def extract_column_name(query_template):
    """
    Extract the column name from the query template enclosed in curly braces.
    """
    match = re.search(r"\{(.*?)\}", query_template)
    if not match:
        raise ValueError("No placeholder found in the query template. Ensure the query contains a placeholder like {column_name}.")
    return match.group(1)


def process_query_and_update_csv(file_path, query_template):
    """
    Processes a query per row based on the placeholder column, writes the
    results back to the CSV file, and adds an 'Answer' column with responses.
    """
    column_name = extract_column_name(query_template)
    df = pd.read_csv(file_path)

    if column_name not in df.columns:
        raise ValueError(f"The specified column '{column_name}' is missing in the provided CSV file.")

    if "Answer" not in df.columns:
        df["Answer"] = ""

    for index, row in df.iterrows():
        value = row[column_name]
        query = query_template.replace(f"{{{column_name}}}", str(value))

        # Search the web, embed the results, and ask the chatbot for each row.
        raw_data = get_raw_data(file_path, query)
        vector_store = process_safety_with_chroma(raw_data)
        qa_system = create_chatbot(vector_store)
        prompt = f"Give me the exact answer for the query '{query}' in a structured format, with a link, using only the content provided."
        answer = ask_question(qa_system, prompt)
        df.at[index, "Answer"] = answer

    df.to_csv(file_path, index=False)
    return df


def process_query_and_update_sheets(file_path, df, query_template):
    """
    Processes a query per row based on the placeholder column and adds an
    'Answer' column to the DataFrame fetched from Google Sheets.
    file_path is unused here; it is kept for interface parity with the CSV path.
    """
    column_name = extract_column_name(query_template)

    if column_name not in df.columns:
        raise ValueError(f"The specified column '{column_name}' is missing in the provided sheet.")

    if "Answer" not in df.columns:
        df["Answer"] = ""

    for index, row in df.iterrows():
        value = row[column_name]
        query = query_template.replace(f"{{{column_name}}}", str(value))

        # Search the web, embed the results, and ask the chatbot for each row.
        raw_data = get_raw_data_sheets(query)
        vector_store = process_safety_with_chroma(raw_data)
        qa_system = create_chatbot(vector_store)
        prompt = f"Give me the exact answer for the query '{query}' in a structured format, with a link, using only the content provided."
        answer = ask_question(qa_system, prompt)
        df.at[index, "Answer"] = answer

    return df
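A minimal sketch of the CSV flow, assuming a hypothetical companies.csv with a "Company" column and SERPAPI_KEY / OPENAI_API_KEY available in the environment:

from modules.data_processor import process_query_and_update_csv

# Hypothetical input file: companies.csv with a "Company" column.
# The {Company} placeholder is filled in row by row.
df = process_query_and_update_csv("companies.csv", "Who is the CEO of {Company}?")
print(df[["Company", "Answer"]].head())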
modules/embedding_storage.py
ADDED
@@ -0,0 +1,47 @@
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.docstore.document import Document

from config import PERSIST_DIRECTORY


def process_safety_with_chroma(data):
    """
    Processes and stores the given structured JSON data into ChromaDB.

    Args:
        data (list): A list of dictionaries containing structured JSON data.

    Returns:
        Chroma: The Chroma vector store object.
    """
    documents = []
    for item in data:
        # Extract fields from the JSON structure.
        content = item.get("snippet", "")
        highlighted_words = item.get("snippet_highlighted_words", [])
        # Flatten the highlighted-words list into a comma-separated string.
        highlighted_words_str = ", ".join(highlighted_words) if isinstance(highlighted_words, list) else str(highlighted_words)

        metadata = {
            "position": item.get("position"),
            "title": item.get("title"),
            "link": item.get("link"),
            "source": item.get("source"),
            "displayed_link": item.get("displayed_link"),
            "highlighted_words": highlighted_words_str,
        }
        # Chroma metadata values must be str, int, float, or bool, so drop missing fields.
        metadata = {k: v for k, v in metadata.items() if v is not None}

        # Create a document for each non-empty snippet.
        if content:
            if highlighted_words_str:
                content += f" Highlighted words: {highlighted_words_str}"
            documents.append(Document(page_content=content, metadata=metadata))

    # Initialize embeddings and the Chroma store.
    embeddings = OpenAIEmbeddings()
    vector_store = Chroma.from_documents(documents, embeddings, persist_directory=PERSIST_DIRECTORY)

    return vector_store
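A sketch of the expected input shape, based on the fields the function reads from each SerpAPI organic_results entry (the values below are made up):

from modules.embedding_storage import process_safety_with_chroma

sample = [{
    "position": 1,
    "title": "Example Corp - About",
    "link": "https://example.com/about",
    "source": "example.com",
    "displayed_link": "example.com > about",
    "snippet": "Example Corp was founded in 1999 by Jane Doe.",
    "snippet_highlighted_words": ["Jane Doe"],
}]
store = process_safety_with_chroma(sample)
print(store.similarity_search("Who founded Example Corp?", k=1))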
modules/gsheet_handler.py
ADDED
@@ -0,0 +1,43 @@
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
import pandas as pd


def fetch_google_sheet_data(credentials_file, sheet_id, sheet_name):
    """
    Fetch data from Google Sheets into a DataFrame.
    Returns the error message as a string on failure.
    """
    try:
        creds = Credentials.from_service_account_file(credentials_file, scopes=["https://www.googleapis.com/auth/spreadsheets.readonly"])
        service = build('sheets', 'v4', credentials=creds)
        sheet = service.spreadsheets()
        result = sheet.values().get(spreadsheetId=sheet_id, range=sheet_name).execute()
        data = result.get('values', [])
        if not data:
            raise ValueError(f"No data found in range '{sheet_name}'.")
        headers = data[0]
        rows = data[1:]
        return pd.DataFrame(rows, columns=headers)
    except Exception as e:
        return str(e)


def update_google_sheet(credentials_file, sheet_id, sheet_name, df):
    """
    Update Google Sheets with the processed data.
    Returns the error message as a string on failure.
    """
    try:
        creds = Credentials.from_service_account_file(credentials_file, scopes=["https://www.googleapis.com/auth/spreadsheets"])
        service = build('sheets', 'v4', credentials=creds)
        sheet = service.spreadsheets()

        # Convert the DataFrame to a list of lists, header row first.
        data = [df.columns.tolist()] + df.values.tolist()

        # Write the values back to the sheet.
        body = {'values': data}
        sheet.values().update(
            spreadsheetId=sheet_id,
            range=sheet_name,
            valueInputOption="RAW",
            body=body
        ).execute()
        return "Google Sheet updated successfully."
    except Exception as e:
        return str(e)
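A round-trip sketch, assuming a hypothetical service_account.json key and a sheet shared with that service account (the ID and range below are placeholders). Because both helpers return the error message as a string, callers have to type-check the result:

from modules.gsheet_handler import fetch_google_sheet_data, update_google_sheet
from modules.data_processor import process_query_and_update_sheets

CREDS = "service_account.json"   # placeholder path
SHEET_ID = "your-sheet-id"       # placeholder ID

df = fetch_google_sheet_data(CREDS, SHEET_ID, "Sheet1")
if isinstance(df, str):  # a string here is an error message, not data
    raise RuntimeError(df)
df = process_query_and_update_sheets(None, df, "Who is the CEO of {Company}?")
print(update_google_sheet(CREDS, SHEET_ID, "Sheet1", df))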
modules/qa_chatbot.py
ADDED
@@ -0,0 +1,39 @@
from langchain.chains import RetrievalQA
from langchain_openai import OpenAI


def create_chatbot(vector_store):
    """
    Creates a chatbot for querying the Chroma vector store.

    Args:
        vector_store (Chroma): The vector store to use.

    Returns:
        RetrievalQA: The QA chatbot object.
    """
    llm = OpenAI(temperature=0.5)
    # "k" must be passed through search_kwargs; as_retriever does not accept it directly.
    retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 5})

    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True
    )
    return qa


def ask_question(qa, query):
    """
    Asks a question to the chatbot and returns the response.

    Args:
        qa (RetrievalQA): The QA chatbot object.
        query (str): The question to ask.

    Returns:
        str: The answer from the chatbot.
    """
    try:
        response = qa.invoke({"query": query})
        answer = response.get('result', 'No answer found.')
        return f"{answer}\n"
    except Exception as e:
        return f"Error: {e}"
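A small end-to-end check of the chatbot, assuming OPENAI_API_KEY is set; the result dict is a made-up stand-in for scraper output:

from modules.embedding_storage import process_safety_with_chroma
from modules.qa_chatbot import create_chatbot, ask_question

results = [{"position": 1, "title": "Acme Corp", "link": "https://example.com/acme",
            "snippet": "Acme Corp is headquartered in Springfield.",
            "snippet_highlighted_words": ["Springfield"]}]
qa = create_chatbot(process_safety_with_chroma(results))
print(ask_question(qa, "Where is Acme Corp headquartered?"))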
modules/scraper.py
ADDED
@@ -0,0 +1,77 @@
import pandas as pd
import requests
import os
from dotenv import load_dotenv


# Load the CSV file.
def load_csv(file_path):
    try:
        data = pd.read_csv(file_path)
        print(f"File loaded successfully. Columns available: {list(data.columns)}")
        return data
    except Exception as e:
        print(f"Error loading file: {e}")
        return None


# Perform a web search using SerpAPI.
def search_web(query, api_key):
    try:
        # Passing the query via params lets requests URL-encode it.
        response = requests.get(
            "https://serpapi.com/search.json",
            params={"q": query, "api_key": api_key},
        )
        if response.status_code == 200:
            return response.json().get("organic_results", [])
        else:
            print(f"Error in search: {response.status_code}")
            return []
    except Exception as e:
        print(f"Search failed: {e}")
        return []


def get_raw_data(file_path, query):
    load_dotenv()
    api_key = os.getenv("SERPAPI_KEY")

    if not file_path or not api_key:
        print("Error: Environment variables not set. Please check your .env file.")
        return

    # Load the CSV to validate the input file; the rows themselves are not used here.
    data = load_csv(file_path)
    if data is None:
        return

    search_results = search_web(query, api_key)
    return search_results


def get_raw_data_sheets(query):
    load_dotenv()
    api_key = os.getenv("SERPAPI_KEY")

    if not api_key:
        print("Error: Environment variables not set. Please check your .env file.")
        return

    search_results = search_web(query, api_key)
    return search_results
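To try the scraper on its own, assuming SERPAPI_KEY is set in a local .env file:

from modules.scraper import get_raw_data_sheets

results = get_raw_data_sheets("founding year of Example Corp")
for r in results or []:
    print(r.get("title"), "->", r.get("link"))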