raghuv-aditya committed on
Commit
94b1e32
·
verified ·
1 Parent(s): 4f18307

Transfer of files

Browse files
modules/data_processor.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import pandas as pd
3
+ from modules.scraper import get_raw_data, get_raw_data_sheets
4
+ from modules.embedding_storage import process_safety_with_chroma
5
+ from modules.qa_chatbot import create_chatbot, ask_question
6
+
7
def extract_column_name(query_template):
    """Return the first placeholder name enclosed in ``{braces}`` in *query_template*.

    Raises:
        ValueError: if the template contains no ``{placeholder}``.
    """
    placeholder = re.search(r"\{(.*?)\}", query_template)
    if placeholder is None:
        raise ValueError("No placeholder found in the query template. Ensure the query contains a placeholder like {column_name}.")
    return placeholder.group(1)
15
+
16
def process_query_and_update_csv(file_path, query_template):
    """
    Answer the templated query once per CSV row and persist the results.

    For each row, the ``{column_name}`` placeholder in *query_template* is
    replaced with that row's value, the query is run through the scrape ->
    embed -> retrieve pipeline, and the response is stored in an 'Answer'
    column. The updated CSV is written back to *file_path*.

    Returns:
        pd.DataFrame: the updated frame.

    Raises:
        ValueError: if the placeholder column is absent from the CSV.
    """
    column_name = extract_column_name(query_template)
    df = pd.read_csv(file_path)

    if column_name not in df.columns:
        raise ValueError(f"The specified column '{column_name}' is missing in the provided CSV file.")

    if "Answer" not in df.columns:
        df["Answer"] = ""

    placeholder = f"{{{column_name}}}"
    for index, row in df.iterrows():
        query = query_template.replace(placeholder, str(row[column_name]))

        # Pipeline: web search -> vector store -> QA chain -> answer.
        raw_data = get_raw_data(file_path, query)
        vector_store = process_safety_with_chroma(raw_data)
        qa_system = create_chatbot(vector_store)
        prompt = f"Give me the exact answer for this below query '{query}' in a structured format with a link from the content provided only."
        answer = ask_question(qa_system, prompt)
        df.at[index, "Answer"] = answer

    df.to_csv(file_path, index=False)
    return df
44
+
45
+
46
def process_query_and_update_sheets(file_path, df, query_template):
    """
    Answer the templated query once per row of *df*, in memory.

    For each row, the ``{column_name}`` placeholder in *query_template* is
    replaced with that row's value, the query is run through the scrape ->
    embed -> retrieve pipeline, and the response is stored in an 'Answer'
    column. Unlike process_query_and_update_csv, nothing is written to disk;
    the caller persists the returned frame (e.g. back to Google Sheets).

    Args:
        file_path: unused; kept for signature compatibility with callers.
        df (pd.DataFrame): the sheet data to process (modified in place).
        query_template (str): query containing a ``{column_name}`` placeholder.

    Returns:
        pd.DataFrame: *df* with the 'Answer' column filled in.

    Raises:
        ValueError: if the placeholder column is absent from *df*.
    """
    column_name = extract_column_name(query_template)

    if column_name not in df.columns:
        raise ValueError(f"The specified column '{column_name}' is missing in the provided CSV file.")

    if "Answer" not in df.columns:
        df["Answer"] = ""

    placeholder = f"{{{column_name}}}"
    for index, row in df.iterrows():
        query = query_template.replace(placeholder, str(row[column_name]))

        # Pipeline: web search -> vector store -> QA chain -> answer.
        raw_data = get_raw_data_sheets(query)
        vector_store = process_safety_with_chroma(raw_data)
        qa_system = create_chatbot(vector_store)
        prompt = f"Give me the exact answer for this below query '{query}' in a structured format with a link from the content provided only."
        answer = ask_question(qa_system, prompt)
        df.at[index, "Answer"] = answer

    return df
modules/embedding_storage.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_openai import OpenAIEmbeddings
2
+ from langchain_chroma import Chroma
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain.docstore.document import Document
5
+ import os
6
+
7
+ from config import PERSIST_DIRECTORY
8
+
9
def process_safety_with_chroma(data):
    """
    Embed structured search-result JSON into a persisted ChromaDB store.

    Args:
        data (list): dicts shaped like SerpAPI organic results; each may
            contain "snippet", "snippet_highlighted_words" and assorted
            metadata fields ("position", "title", "link", ...).

    Returns:
        Chroma: vector store persisted under PERSIST_DIRECTORY.
    """
    documents = []
    for item in data:
        content = item.get("snippet", "")
        highlighted_words = item.get("snippet_highlighted_words", [])

        # Chroma metadata values must be scalars: flatten the list once and
        # reuse the same string everywhere (the old code computed it twice
        # with inconsistent non-list fallbacks).
        if isinstance(highlighted_words, list):
            highlighted_words_str = ", ".join(highlighted_words)
        else:
            highlighted_words_str = str(highlighted_words)

        metadata = {
            "position": item.get("position"),
            "title": item.get("title"),
            "link": item.get("link"),
            "source": item.get("source"),
            "displayed_link": item.get("displayed_link"),
            "highlighted_words": highlighted_words_str,
        }

        # Skip result entries with no snippet text; append the highlighted
        # words to the embedded content so they influence retrieval.
        if content:
            if highlighted_words_str:
                content += f" Highlighted words: {highlighted_words_str}"
            documents.append(Document(page_content=content, metadata=metadata))

    # Initialize embeddings and build (or extend) the persistent Chroma store.
    embeddings = OpenAIEmbeddings()
    vector_store = Chroma.from_documents(documents, embeddings, persist_directory=PERSIST_DIRECTORY)

    return vector_store
modules/gsheet_handler.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from google.oauth2.service_account import Credentials
2
+ from googleapiclient.discovery import build
3
+ import pandas as pd
4
+
5
def fetch_google_sheet_data(credentials_file, sheet_id, sheet_name):
    """
    Fetch a Google Sheets range as a DataFrame (first row = column headers).

    Args:
        credentials_file (str): path to a service-account JSON key file.
        sheet_id (str): spreadsheet ID.
        sheet_name (str): A1-notation range (typically the tab name).

    Returns:
        pd.DataFrame on success (empty if the range holds no values), or the
        error message as a str on failure — callers must check the type.
    """
    try:
        creds = Credentials.from_service_account_file(
            credentials_file,
            scopes=["https://www.googleapis.com/auth/spreadsheets.readonly"],
        )
        service = build('sheets', 'v4', credentials=creds)
        sheet = service.spreadsheets()
        result = sheet.values().get(spreadsheetId=sheet_id, range=sheet_name).execute()
        data = result.get('values', [])
        # An empty range used to raise IndexError on data[0] and return the
        # exception text; return an empty frame instead.
        if not data:
            return pd.DataFrame()
        headers = data[0]
        rows = data[1:]
        return pd.DataFrame(rows, columns=headers)
    except Exception as e:
        return str(e)
20
+
21
def update_google_sheet(credentials_file, sheet_id, sheet_name, df):
    """
    Push *df* (header row plus data rows) into a Google Sheets range.

    Args:
        credentials_file (str): path to a service-account JSON key file.
        sheet_id (str): spreadsheet ID.
        sheet_name (str): A1-notation range (typically the tab name).
        df (pd.DataFrame): data to write.

    Returns:
        str: a success message, or the exception text on failure (errors
        are returned, not raised).
    """
    try:
        scopes = ["https://www.googleapis.com/auth/spreadsheets"]
        creds = Credentials.from_service_account_file(credentials_file, scopes=scopes)
        service = build('sheets', 'v4', credentials=creds)
        sheet = service.spreadsheets()

        # First row carries the column headers, then the cell values.
        payload = [df.columns.tolist()] + df.values.tolist()

        sheet.values().update(
            spreadsheetId=sheet_id,
            range=sheet_name,
            valueInputOption="RAW",
            body={'values': payload},
        ).execute()
        return "Google Sheet updated successfully."
    except Exception as e:
        return str(e)
modules/qa_chatbot.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.chains import RetrievalQA
2
+ from langchain_openai import OpenAI
3
+ from langchain_chroma import Chroma
4
+
5
def create_chatbot(vector_store):
    """
    Build a RetrievalQA chain over *vector_store*.

    Args:
        vector_store (Chroma): store whose MMR retriever (k=5) feeds the chain.

    Returns:
        RetrievalQA: "stuff"-type chain that also returns source documents.
    """
    retriever = vector_store.as_retriever(search_type="mmr", k=5)
    model = OpenAI(temperature=0.5)

    return RetrievalQA.from_chain_type(
        llm=model,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )
23
+
24
+
25
def ask_question(qa, query):
    """
    Ask the QA chain a question and return the newline-terminated answer.

    Args:
        qa (RetrievalQA): the QA chain to invoke.
        query (str): the question text.

    Returns:
        str: the chain's 'result' field (or a fallback message) followed by
        a newline; any exception is reported as "Error: <details>".
    """
    try:
        outcome = qa.invoke({"query": query})
        result_text = outcome.get('result', 'No answer found.')
        return f"{result_text}\n"
    except Exception as e:
        return f"Error: {e}"
modules/scraper.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import random
3
+ import requests
4
+ import os
5
+ from dotenv import load_dotenv
6
+
7
+ # Load the CSV file
8
def load_csv(file_path):
    """
    Read a CSV file into a DataFrame, reporting the outcome on stdout.

    Args:
        file_path (str): path to the CSV file.

    Returns:
        pd.DataFrame on success, or None if reading fails (error is printed).
    """
    try:
        frame = pd.read_csv(file_path)
        print(f"File loaded successfully. Columns available: {list(frame.columns)}")
        return frame
    except Exception as e:
        print(f"Error loading file: {e}")
        return None
16
+
17
+ # Perform web search using SerpAPI
18
def search_web(query, api_key):
    """
    Search Google via SerpAPI and return the organic results.

    Args:
        query (str): free-text search query.
        api_key (str): SerpAPI key.

    Returns:
        list: the response's "organic_results" entries, or [] on any failure
        (the error is printed, not raised).
    """
    try:
        # Let requests build the query string so special characters in the
        # query (spaces, '&', '#', unicode) are percent-encoded correctly;
        # the previous f-string URL produced malformed requests for them.
        response = requests.get(
            "https://serpapi.com/search.json",
            params={"q": query, "api_key": api_key},
        )
        if response.status_code == 200:
            return response.json().get("organic_results", [])
        print(f"Error in search: {response.status_code}")
        return []
    except Exception as e:
        print(f"Search failed: {e}")
        return []
31
+
32
def get_raw_data(file_path, query):
    """
    Run *query* through SerpAPI after validating the configuration.

    Loads environment variables from .env, checks that both *file_path* and
    SERPAPI_KEY are set, and verifies the CSV is readable before searching.

    Args:
        file_path (str): path to the input CSV (loaded only as a sanity
            check; the search itself uses just the query string).
        query (str): the search query.

    Returns:
        list | None: organic search results, or None when configuration or
        the CSV load fails (errors are printed, not raised).
    """
    load_dotenv()
    api_key = os.getenv("SERPAPI_KEY")

    # Validate configuration before doing any file I/O.
    if not file_path or not api_key:
        print("Error: Environment variables not set. Please check your .env file.")
        return None

    # Sanity-check the input file once (the original loaded it twice).
    data = load_csv(file_path)
    if data is None:
        return None

    return search_web(query, api_key)
60
+
61
+
62
def get_raw_data_sheets(query):
    """
    Run *query* through SerpAPI using the SERPAPI_KEY from the environment.

    Args:
        query (str): the search query.

    Returns:
        list | None: organic search results, or None when the key is
        missing (an error is printed, not raised).
    """
    load_dotenv()

    serp_key = os.getenv("SERPAPI_KEY")
    if not serp_key:
        print("Error: Environment variables not set. Please check your .env file.")
        return

    return search_web(query, serp_key)
77
+