Spaces:

mdredze1
/

tobacco-watcher-chat-with-citations

Sleeping

App Files Files Community

vtiyyal1 committed on Dec 17, 2024

Commit

adef65e

verified ·

1 Parent(s): fe30496

Upload 6 files

Browse files

Replaced LangChain with the OpenAI client

Files changed (3) hide show

app.py +54 -45
feed_to_llm_v2.py +21 -16
requirements.txt +9 -15

app.py CHANGED Viewed

@@ -1,21 +1,18 @@
 import gradio as gr
 from full_chain import get_response
 import os
-import openai
 import json
-from langchain_openai import ChatOpenAI
-from langchain.schema import HumanMessage, SystemMessage
 import urllib3
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-# api_key = os.getenv("OPENAI_API_KEY")
-# client = openai.OpenAI(api_key=api_key)
-# Initialize ChatOpenAI
-llm = ChatOpenAI(
-    model="gpt-4o-mini",
-    temperature=0
-)
 def load_content(filename):
     """Load content from text files"""
@@ -34,54 +31,62 @@ def predict(message, history):
     """Process user message and return appropriate response."""
     try:
         # Query classification prompt
-        classifier_prompt = """You are the Tobacco Watcher Assistant. Analyze the user's query and categorize it into exactly ONE of these types:
-        1. HELP - Questions about using the website, its features, or navigation
-        Example: "How do I use filters?", "How to search for articles?"
-        2. ABOUT - Questions about Tobacco Watcher's purpose, mission, or organization
-        Example: "What is Tobacco Watcher?", "Who runs this website?"
-        3. FILTER - Requests for specific articles using filters
-        Example: "Show articles about smoking in India from 2023", "Find French articles about e-cigarettes"
-        4. QUERY - Questions seeking tobacco-related information
-        Example: "How many people smoke in Asia?", "What are the effects of secondhand smoke?"
-        Respond with ONLY the category name (HELP, ABOUT, FILTER, or QUERY).
-        """
-        messages = [
-            SystemMessage(content=classifier_prompt),
-            HumanMessage(content=message)
         ]
-        response = llm.invoke(messages)
-        query_type = response.content.strip().upper()
         print(f"Query type: {query_type}")
         if query_type == "HELP":
             help_content = load_content("help.txt")
-            messages = [
-                SystemMessage(content="""You are the Tobacco Watcher Help Assistant.
 Use the provided help content to guide users on how to use the platform's features.
-Be clear and specific in your instructions. If a feature isn't mentioned in the content, acknowledge that and suggest contacting support."""),
-                HumanMessage(content=f"Using this help content:\n\n{help_content}\n\nAnswer this question: {message}")
             ]
-            response = llm.invoke(messages)
-            return response.content
         elif query_type == "ABOUT":
             about_content = load_content("about.txt")
-            messages = [
-                SystemMessage(content="""You are the Tobacco Watcher Assistant specializing in explaining the platform.
 Use the provided content to answer questions about Tobacco Watcher's purpose, mission, features, and organization.
-Be concise but informative. If a specific detail isn't in the content, say so rather than making assumptions."""),
-                HumanMessage(content=f"Using this content:\n\n{about_content}\n\nAnswer this question: {message}")
             ]
-            response = llm.invoke(messages)
-            return response.content
         elif query_type == "FILTER":
             filter_options = load_filter_options()
@@ -108,14 +113,18 @@ Be concise but informative. If a specific detail isn't in the content, say so ra
             url_prompt += "\nGenerate a valid URL for this query. Return ONLY the complete URL."
-            messages = [
-                SystemMessage(content=url_prompt),
-                HumanMessage(content=message)
             ]
             try:
-                response = llm.invoke(messages)
-                url_response = response.content.strip()
                 print(f"Generated URL: {url_response}")
                 if url_response.startswith("http"):

 import gradio as gr
 from full_chain import get_response
 import os
+import json
+from openai import OpenAI
 import json
+# from langchain_openai import ChatOpenAI
+# from langchain.schema import HumanMessage, SystemMessage
 import urllib3
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+# Initialize OpenAI client
+client = OpenAI()  # It will automatically use OPENAI_API_KEY from environment
 def load_content(filename):
     """Load content from text files"""
     """Process user message and return appropriate response."""
     try:
         # Query classification prompt
+        classifier_messages = [
+            {"role": "system", "content": """You are the Tobacco Watcher Assistant. Analyze the user's query and categorize it into exactly ONE of these types:
+            1. HELP - Questions about using the website, its features, or navigation
+            Example: "How do I use filters?", "How to search for articles?"
+            2. ABOUT - Questions about Tobacco Watcher's purpose, mission, or organization
+            Example: "What is Tobacco Watcher?", "Who runs this website?"
+            3. FILTER - Requests for specific articles using filters
+            Example: "Show articles about smoking in India from 2023", "Find French articles about e-cigarettes"
+            4. QUERY - Questions seeking tobacco-related information
+            Example: "How many people smoke in Asia?", "What are the effects of secondhand smoke?"
+            Respond with ONLY the category name (HELP, ABOUT, FILTER, or QUERY)."""},
+            {"role": "user", "content": message}
         ]
+        completion = client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=classifier_messages,
+            temperature=0
+        )
+        query_type = completion.choices[0].message.content.strip().upper()
         print(f"Query type: {query_type}")
         if query_type == "HELP":
             help_content = load_content("help.txt")
+            help_messages = [
+                {"role": "system", "content": """You are the Tobacco Watcher Help Assistant.
 Use the provided help content to guide users on how to use the platform's features.
+Be clear and specific in your instructions. If a feature isn't mentioned in the content, acknowledge that and suggest contacting support."""},
+                {"role": "user", "content": f"Using this help content:\n\n{help_content}\n\nAnswer this question: {message}"}
             ]
+            completion = client.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=help_messages,
+                temperature=0
+            )
+            return completion.choices[0].message.content
         elif query_type == "ABOUT":
             about_content = load_content("about.txt")
+            about_messages = [
+                {"role": "system", "content": """You are the Tobacco Watcher Assistant specializing in explaining the platform.
 Use the provided content to answer questions about Tobacco Watcher's purpose, mission, features, and organization.
+Be concise but informative. If a specific detail isn't in the content, say so rather than making assumptions."""},
+                {"role": "user", "content": f"Using this content:\n\n{about_content}\n\nAnswer this question: {message}"}
             ]
+            completion = client.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=about_messages,
+                temperature=0
+            )
+            return completion.choices[0].message.content
         elif query_type == "FILTER":
             filter_options = load_filter_options()
             url_prompt += "\nGenerate a valid URL for this query. Return ONLY the complete URL."
+            url_messages = [
+                {"role": "system", "content": url_prompt},
+                {"role": "user", "content": message}
             ]
             try:
+                completion = client.chat.completions.create(
+                    model="gpt-4o-mini",
+                    messages=url_messages,
+                    temperature=0
+                )
+                url_response = completion.choices[0].message.content.strip()
                 print(f"Generated URL: {url_response}")
                 if url_response.startswith("http"):

feed_to_llm_v2.py CHANGED Viewed

@@ -1,9 +1,4 @@
-from langchain_openai import ChatOpenAI
-from langchain.schema import (
-    HumanMessage,
-    SystemMessage
-)
 import tiktoken
 import re
@@ -11,6 +6,7 @@ from get_articles import save_solr_articles_full
 from rerank import crossencoder_rerank_answer
 import logging
 from logging.handlers import RotatingFileHandler
 # Configure logging
 logger = logging.getLogger("TobaccoInfoAssistant")
@@ -23,11 +19,13 @@ formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
 handler.setFormatter(formatter)
 logger.addHandler(handler)
 def num_tokens_from_string(string: str, encoder) -> int:
     """Return the number of tokens ``encoder.encode`` yields for ``string``."""
     # The count is simply the length of whatever sequence the encoder returns.
     return len(encoder.encode(string))
 def feed_articles_to_gpt_with_links(information, question):
     prompt = """
     You are a Question Answering system specializing in tobacco-related topics. You have access to several curated articles, each numbered (e.g., Article 1, Article 2). These articles cover various aspects of tobacco use, health effects, legislation, and quitting resources.
@@ -60,6 +58,7 @@ def feed_articles_to_gpt_with_links(information, question):
     published_dates = [published_dates for score, contents, uuids, titles, domains, published_dates in information]
     logger.info(f"Article retrieved: {len(articles)}")
     logger.info(f"Article titles: {titles_list}")
     for i in range(len(articles)):
         addition = f"Article {i + 1}: {articles[i]} {separator}"
         token_count += num_tokens_from_string(addition, encoder)
@@ -69,14 +68,18 @@ def feed_articles_to_gpt_with_links(information, question):
     prompt += content
     logger.info(f"Prompt: {prompt}")
-    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.0)
-    message = [
-        SystemMessage(content=prompt),
-        HumanMessage(content=question)
     ]
-    response = llm.invoke(message)
-    response_content = response.content  # Access the content of the AIMessage
     logger.info(f"LLM Response Content: {response_content}")
     # Extract sources from the response content
@@ -84,7 +87,7 @@ def feed_articles_to_gpt_with_links(information, question):
     parenthetical_matches = re.findall(r'\(Article \d+\)', response_content)
     if not (inline_matches or parenthetical_matches):
-        return response_content, [], [], []
     # Combine and get unique article numbers
     all_matches = inline_matches + [m.strip('()') for m in parenthetical_matches]
@@ -122,7 +125,7 @@ def feed_articles_to_gpt_with_links(information, question):
     cited_published_dates = []
     for article_num in used_article_nums:
         uuid = uuids[article_num]
-        link = f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/"
         cited_links.append(link)
         cited_titles.append(titles_list[article_num])
         cited_domains.append(domains_list[article_num])
@@ -133,6 +136,8 @@ if __name__ == "__main__":
     question = "How is United States fighting against tobacco addiction?"
     rerank_type = "crossencoder"
     llm_type = "chat"
-    csv_path = save_solr_articles_full(question, keyword_type="rake")
     reranked_out = crossencoder_rerank_answer(csv_path, question)
     feed_articles_to_gpt_with_links(reranked_out, question)

+from openai import OpenAI
 import tiktoken
 import re
 from rerank import crossencoder_rerank_answer
 import logging
 from logging.handlers import RotatingFileHandler
+import os
 # Configure logging
 logger = logging.getLogger("TobaccoInfoAssistant")
 handler.setFormatter(formatter)
 logger.addHandler(handler)
+# Initialize OpenAI client
+client = OpenAI()
 def num_tokens_from_string(string: str, encoder) -> int:
     """Return the number of tokens ``encoder.encode`` yields for ``string``."""
     # The count is simply the length of whatever sequence the encoder returns.
     return len(encoder.encode(string))
 def feed_articles_to_gpt_with_links(information, question):
     prompt = """
     You are a Question Answering system specializing in tobacco-related topics. You have access to several curated articles, each numbered (e.g., Article 1, Article 2). These articles cover various aspects of tobacco use, health effects, legislation, and quitting resources.
     published_dates = [published_dates for score, contents, uuids, titles, domains, published_dates in information]
     logger.info(f"Article retrieved: {len(articles)}")
     logger.info(f"Article titles: {titles_list}")
     for i in range(len(articles)):
         addition = f"Article {i + 1}: {articles[i]} {separator}"
         token_count += num_tokens_from_string(addition, encoder)
     prompt += content
     logger.info(f"Prompt: {prompt}")
+    messages = [
+        {"role": "system", "content": prompt},
+        {"role": "user", "content": question}
     ]
+    completion = client.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=messages,
+        temperature=0
+    )
+    response_content = completion.choices[0].message.content
     logger.info(f"LLM Response Content: {response_content}")
     # Extract sources from the response content
     parenthetical_matches = re.findall(r'\(Article \d+\)', response_content)
     if not (inline_matches or parenthetical_matches):
+        return response_content, [], [], [], []
     # Combine and get unique article numbers
     all_matches = inline_matches + [m.strip('()') for m in parenthetical_matches]
     cited_published_dates = []
     for article_num in used_article_nums:
         uuid = uuids[article_num]
+        link = f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/"
         cited_links.append(link)
         cited_titles.append(titles_list[article_num])
         cited_domains.append(domains_list[article_num])
     question = "How is United States fighting against tobacco addiction?"
     rerank_type = "crossencoder"
     llm_type = "chat"
+    from get_articles import save_solr_articles_full
+    from rerank import crossencoder_rerank_answer
+    csv_path = save_solr_articles_full(question, 15, keyword_type="rake")
     reranked_out = crossencoder_rerank_answer(csv_path, question)
     feed_articles_to_gpt_with_links(reranked_out, question)

requirements.txt CHANGED Viewed

@@ -1,15 +1,9 @@
-gradio==4.25.0
-langchain==0.1.14
-langchain-core==0.1.40
-langchain-openai==0.1.1
-nltk==3.8.1
-openai==1.16.2
-pandas==2.2.1
-pysolr==3.9.0
-rake-nltk==1.0.6
-sentence-transformers==2.2.2
-tiktoken==0.5.2
-torch==2.1.2
-huggingface-hub==0.20.2
-python-dotenv==1.0.1
-docarray==0.40.0

+gradio==4.25.0
+openai
+nltk==3.8.1
+pandas==2.2.1
+pysolr==3.9.0
+rake-nltk==1.0.6
+sentence-transformers==2.2.2
+tiktoken==0.5.2
+python-dotenv==1.0.1