Transfer of files

- modules/data_processor.py +75 -0
- modules/embedding_storage.py +47 -0
- modules/gsheet_handler.py +43 -0
- modules/qa_chatbot.py +39 -0
- modules/scraper.py +77 -0
modules/data_processor.py
ADDED
@@ -0,0 +1,75 @@
import re
import pandas as pd
from modules.scraper import get_raw_data, get_raw_data_sheets
from modules.embedding_storage import process_safety_with_chroma
from modules.qa_chatbot import create_chatbot, ask_question


def extract_column_name(query_template):
    """
    Extract the column name from the query template enclosed in curly braces.
    """
    match = re.search(r"\{(.*?)\}", query_template)
    if not match:
        raise ValueError("No placeholder found in the query template. Ensure the query contains a placeholder like {column_name}.")
    return match.group(1)


def process_query_and_update_csv(file_path, query_template):
    """
    Processes a query per row based on the placeholder column, writes the
    results back to the CSV file, and adds an 'Answer' column with responses.
    """
    column_name = extract_column_name(query_template)
    df = pd.read_csv(file_path)

    if column_name not in df.columns:
        raise ValueError(f"The specified column '{column_name}' is missing in the provided CSV file.")

    if "Answer" not in df.columns:
        df["Answer"] = ""

    for index, row in df.iterrows():
        value = row[column_name]
        query = query_template.replace(f"{{{column_name}}}", str(value))

        # Search the web, embed the results, and ask the chatbot for each row.
        raw_data = get_raw_data(file_path, query)
        vector_store = process_safety_with_chroma(raw_data)
        qa_system = create_chatbot(vector_store)
        prompt = f"Give me the exact answer for the query '{query}' in a structured format, with a link, using only the content provided."
        answer = ask_question(qa_system, prompt)
        df.at[index, "Answer"] = answer

    df.to_csv(file_path, index=False)
    return df


def process_query_and_update_sheets(file_path, df, query_template):
    """
    Processes a query per row based on the placeholder column and adds an
    'Answer' column to the DataFrame fetched from Google Sheets.
    file_path is unused here; it is kept for interface parity with the CSV path.
    """
    column_name = extract_column_name(query_template)

    if column_name not in df.columns:
        raise ValueError(f"The specified column '{column_name}' is missing in the provided sheet.")

    if "Answer" not in df.columns:
        df["Answer"] = ""

    for index, row in df.iterrows():
        value = row[column_name]
        query = query_template.replace(f"{{{column_name}}}", str(value))

        # Search the web, embed the results, and ask the chatbot for each row.
        raw_data = get_raw_data_sheets(query)
        vector_store = process_safety_with_chroma(raw_data)
        qa_system = create_chatbot(vector_store)
        prompt = f"Give me the exact answer for the query '{query}' in a structured format, with a link, using only the content provided."
        answer = ask_question(qa_system, prompt)
        df.at[index, "Answer"] = answer

    return df
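A minimal sketch of the CSV flow, assuming a hypothetical companies.csv with a "Company" column and SERPAPI_KEY / OPENAI_API_KEY available in the environment:

from modules.data_processor import process_query_and_update_csv

# Hypothetical input file: companies.csv with a "Company" column.
# The {Company} placeholder is filled in row by row.
df = process_query_and_update_csv("companies.csv", "Who is the CEO of {Company}?")
print(df[["Company", "Answer"]].head())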
modules/embedding_storage.py
ADDED
@@ -0,0 +1,47 @@
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.docstore.document import Document

from config import PERSIST_DIRECTORY


def process_safety_with_chroma(data):
    """
    Processes and stores the given structured JSON data into ChromaDB.

    Args:
        data (list): A list of dictionaries containing structured JSON data.

    Returns:
        Chroma: The Chroma vector store object.
    """
    documents = []
    for item in data:
        # Extract fields from the JSON structure.
        content = item.get("snippet", "")
        highlighted_words = item.get("snippet_highlighted_words", [])
        # Flatten the highlighted-words list into a comma-separated string.
        highlighted_words_str = ", ".join(highlighted_words) if isinstance(highlighted_words, list) else str(highlighted_words)

        metadata = {
            "position": item.get("position"),
            "title": item.get("title"),
            "link": item.get("link"),
            "source": item.get("source"),
            "displayed_link": item.get("displayed_link"),
            "highlighted_words": highlighted_words_str,
        }
        # Chroma metadata values must be str, int, float, or bool, so drop missing fields.
        metadata = {k: v for k, v in metadata.items() if v is not None}

        # Create a document for each non-empty snippet.
        if content:
            if highlighted_words_str:
                content += f" Highlighted words: {highlighted_words_str}"
            documents.append(Document(page_content=content, metadata=metadata))

    # Initialize embeddings and the Chroma store.
    embeddings = OpenAIEmbeddings()
    vector_store = Chroma.from_documents(documents, embeddings, persist_directory=PERSIST_DIRECTORY)

    return vector_store
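A sketch of the expected input shape, based on the fields the function reads from each SerpAPI organic_results entry (the values below are made up):

from modules.embedding_storage import process_safety_with_chroma

sample = [{
    "position": 1,
    "title": "Example Corp - About",
    "link": "https://example.com/about",
    "source": "example.com",
    "displayed_link": "example.com > about",
    "snippet": "Example Corp was founded in 1999 by Jane Doe.",
    "snippet_highlighted_words": ["Jane Doe"],
}]
store = process_safety_with_chroma(sample)
print(store.similarity_search("Who founded Example Corp?", k=1))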
modules/gsheet_handler.py
ADDED
@@ -0,0 +1,43 @@
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
import pandas as pd


def fetch_google_sheet_data(credentials_file, sheet_id, sheet_name):
    """
    Fetch data from Google Sheets into a DataFrame.
    Returns the error message as a string on failure.
    """
    try:
        creds = Credentials.from_service_account_file(credentials_file, scopes=["https://www.googleapis.com/auth/spreadsheets.readonly"])
        service = build('sheets', 'v4', credentials=creds)
        sheet = service.spreadsheets()
        result = sheet.values().get(spreadsheetId=sheet_id, range=sheet_name).execute()
        data = result.get('values', [])
        if not data:
            raise ValueError(f"No data found in range '{sheet_name}'.")
        headers = data[0]
        rows = data[1:]
        return pd.DataFrame(rows, columns=headers)
    except Exception as e:
        return str(e)


def update_google_sheet(credentials_file, sheet_id, sheet_name, df):
    """
    Update Google Sheets with the processed data.
    Returns the error message as a string on failure.
    """
    try:
        creds = Credentials.from_service_account_file(credentials_file, scopes=["https://www.googleapis.com/auth/spreadsheets"])
        service = build('sheets', 'v4', credentials=creds)
        sheet = service.spreadsheets()

        # Convert the DataFrame to a list of lists, header row first.
        data = [df.columns.tolist()] + df.values.tolist()

        # Write the values back to the sheet.
        body = {'values': data}
        sheet.values().update(
            spreadsheetId=sheet_id,
            range=sheet_name,
            valueInputOption="RAW",
            body=body
        ).execute()
        return "Google Sheet updated successfully."
    except Exception as e:
        return str(e)
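A round-trip sketch, assuming a hypothetical service_account.json key and a sheet shared with that service account (the ID and range below are placeholders). Because both helpers return the error message as a string, callers have to type-check the result:

from modules.gsheet_handler import fetch_google_sheet_data, update_google_sheet
from modules.data_processor import process_query_and_update_sheets

CREDS = "service_account.json"   # placeholder path
SHEET_ID = "your-sheet-id"       # placeholder ID

df = fetch_google_sheet_data(CREDS, SHEET_ID, "Sheet1")
if isinstance(df, str):  # a string here is an error message, not data
    raise RuntimeError(df)
df = process_query_and_update_sheets(None, df, "Who is the CEO of {Company}?")
print(update_google_sheet(CREDS, SHEET_ID, "Sheet1", df))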
modules/qa_chatbot.py
ADDED
@@ -0,0 +1,39 @@
from langchain.chains import RetrievalQA
from langchain_openai import OpenAI


def create_chatbot(vector_store):
    """
    Creates a chatbot for querying the Chroma vector store.

    Args:
        vector_store (Chroma): The vector store to use.

    Returns:
        RetrievalQA: The QA chatbot object.
    """
    llm = OpenAI(temperature=0.5)
    # "k" must be passed through search_kwargs; as_retriever does not accept it directly.
    retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 5})

    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True
    )
    return qa


def ask_question(qa, query):
    """
    Asks a question to the chatbot and returns the response.

    Args:
        qa (RetrievalQA): The QA chatbot object.
        query (str): The question to ask.

    Returns:
        str: The answer from the chatbot.
    """
    try:
        response = qa.invoke({"query": query})
        answer = response.get('result', 'No answer found.')
        return f"{answer}\n"
    except Exception as e:
        return f"Error: {e}"
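A small end-to-end check of the chatbot, assuming OPENAI_API_KEY is set; the result dict is a made-up stand-in for scraper output:

from modules.embedding_storage import process_safety_with_chroma
from modules.qa_chatbot import create_chatbot, ask_question

results = [{"position": 1, "title": "Acme Corp", "link": "https://example.com/acme",
            "snippet": "Acme Corp is headquartered in Springfield.",
            "snippet_highlighted_words": ["Springfield"]}]
qa = create_chatbot(process_safety_with_chroma(results))
print(ask_question(qa, "Where is Acme Corp headquartered?"))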
modules/scraper.py
ADDED
@@ -0,0 +1,77 @@
import pandas as pd
import requests
import os
from dotenv import load_dotenv


# Load the CSV file.
def load_csv(file_path):
    try:
        data = pd.read_csv(file_path)
        print(f"File loaded successfully. Columns available: {list(data.columns)}")
        return data
    except Exception as e:
        print(f"Error loading file: {e}")
        return None


# Perform a web search using SerpAPI.
def search_web(query, api_key):
    try:
        # Passing the query via params lets requests URL-encode it.
        response = requests.get(
            "https://serpapi.com/search.json",
            params={"q": query, "api_key": api_key},
        )
        if response.status_code == 200:
            return response.json().get("organic_results", [])
        else:
            print(f"Error in search: {response.status_code}")
            return []
    except Exception as e:
        print(f"Search failed: {e}")
        return []


def get_raw_data(file_path, query):
    load_dotenv()
    api_key = os.getenv("SERPAPI_KEY")

    if not file_path or not api_key:
        print("Error: Environment variables not set. Please check your .env file.")
        return

    # Load the CSV to validate the input file; the rows themselves are not used here.
    data = load_csv(file_path)
    if data is None:
        return

    search_results = search_web(query, api_key)
    return search_results


def get_raw_data_sheets(query):
    load_dotenv()
    api_key = os.getenv("SERPAPI_KEY")

    if not api_key:
        print("Error: Environment variables not set. Please check your .env file.")
        return

    search_results = search_web(query, api_key)
    return search_results
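To try the scraper on its own, assuming SERPAPI_KEY is set in a local .env file:

from modules.scraper import get_raw_data_sheets

results = get_raw_data_sheets("founding year of Example Corp")
for r in results or []:
    print(r.get("title"), "->", r.get("link"))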