vtiyyal1 commited on
Commit
adef65e
·
verified ·
1 Parent(s): fe30496

Upload 6 files

Browse files

replaced LangChain with OpenAI

Files changed (3) hide show
  1. app.py +54 -45
  2. feed_to_llm_v2.py +21 -16
  3. requirements.txt +9 -15
app.py CHANGED
@@ -1,21 +1,18 @@
1
  import gradio as gr
2
  from full_chain import get_response
3
  import os
4
- import openai
 
5
  import json
6
 
7
- from langchain_openai import ChatOpenAI
8
- from langchain.schema import HumanMessage, SystemMessage
9
  import urllib3
10
 
11
  urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
12
- # api_key = os.getenv("OPENAI_API_KEY")
13
- # client = openai.OpenAI(api_key=api_key)
14
- # Initialize ChatOpenAI
15
- llm = ChatOpenAI(
16
- model="gpt-4o-mini",
17
- temperature=0
18
- )
19
 
20
  def load_content(filename):
21
  """Load content from text files"""
@@ -34,54 +31,62 @@ def predict(message, history):
34
  """Process user message and return appropriate response."""
35
  try:
36
  # Query classification prompt
37
- classifier_prompt = """You are the Tobacco Watcher Assistant. Analyze the user's query and categorize it into exactly ONE of these types:
 
38
 
39
- 1. HELP - Questions about using the website, its features, or navigation
40
- Example: "How do I use filters?", "How to search for articles?"
41
 
42
- 2. ABOUT - Questions about Tobacco Watcher's purpose, mission, or organization
43
- Example: "What is Tobacco Watcher?", "Who runs this website?"
44
 
45
- 3. FILTER - Requests for specific articles using filters
46
- Example: "Show articles about smoking in India from 2023", "Find French articles about e-cigarettes"
47
 
48
- 4. QUERY - Questions seeking tobacco-related information
49
- Example: "How many people smoke in Asia?", "What are the effects of secondhand smoke?"
50
 
51
- Respond with ONLY the category name (HELP, ABOUT, FILTER, or QUERY).
52
- """
53
-
54
- messages = [
55
- SystemMessage(content=classifier_prompt),
56
- HumanMessage(content=message)
57
  ]
58
 
59
- response = llm.invoke(messages)
60
- query_type = response.content.strip().upper()
 
 
 
 
61
  print(f"Query type: {query_type}")
62
 
63
  if query_type == "HELP":
64
  help_content = load_content("help.txt")
65
- messages = [
66
- SystemMessage(content="""You are the Tobacco Watcher Help Assistant.
67
  Use the provided help content to guide users on how to use the platform's features.
68
- Be clear and specific in your instructions. If a feature isn't mentioned in the content, acknowledge that and suggest contacting support."""),
69
- HumanMessage(content=f"Using this help content:\n\n{help_content}\n\nAnswer this question: {message}")
70
  ]
71
-
72
- response = llm.invoke(messages)
73
- return response.content
 
 
 
74
 
75
  elif query_type == "ABOUT":
76
  about_content = load_content("about.txt")
77
- messages = [
78
- SystemMessage(content="""You are the Tobacco Watcher Assistant specializing in explaining the platform.
79
  Use the provided content to answer questions about Tobacco Watcher's purpose, mission, features, and organization.
80
- Be concise but informative. If a specific detail isn't in the content, say so rather than making assumptions."""),
81
- HumanMessage(content=f"Using this content:\n\n{about_content}\n\nAnswer this question: {message}")
82
  ]
83
- response = llm.invoke(messages)
84
- return response.content
 
 
 
 
85
 
86
  elif query_type == "FILTER":
87
  filter_options = load_filter_options()
@@ -108,14 +113,18 @@ Be concise but informative. If a specific detail isn't in the content, say so ra
108
 
109
  url_prompt += "\nGenerate a valid URL for this query. Return ONLY the complete URL."
110
 
111
- messages = [
112
- SystemMessage(content=url_prompt),
113
- HumanMessage(content=message)
114
  ]
115
 
116
  try:
117
- response = llm.invoke(messages)
118
- url_response = response.content.strip()
 
 
 
 
119
  print(f"Generated URL: {url_response}")
120
 
121
  if url_response.startswith("http"):
 
1
  import gradio as gr
2
  from full_chain import get_response
3
  import os
4
+ import json
5
+ from openai import OpenAI
6
  import json
7
 
8
+ # from langchain_openai import ChatOpenAI
9
+ # from langchain.schema import HumanMessage, SystemMessage
10
  import urllib3
11
 
12
  urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
13
+
14
+ # Initialize OpenAI client
15
+ client = OpenAI() # It will automatically use OPENAI_API_KEY from environment
 
 
 
 
16
 
17
  def load_content(filename):
18
  """Load content from text files"""
 
31
  """Process user message and return appropriate response."""
32
  try:
33
  # Query classification prompt
34
+ classifier_messages = [
35
+ {"role": "system", "content": """You are the Tobacco Watcher Assistant. Analyze the user's query and categorize it into exactly ONE of these types:
36
 
37
+ 1. HELP - Questions about using the website, its features, or navigation
38
+ Example: "How do I use filters?", "How to search for articles?"
39
 
40
+ 2. ABOUT - Questions about Tobacco Watcher's purpose, mission, or organization
41
+ Example: "What is Tobacco Watcher?", "Who runs this website?"
42
 
43
+ 3. FILTER - Requests for specific articles using filters
44
+ Example: "Show articles about smoking in India from 2023", "Find French articles about e-cigarettes"
45
 
46
+ 4. QUERY - Questions seeking tobacco-related information
47
+ Example: "How many people smoke in Asia?", "What are the effects of secondhand smoke?"
48
 
49
+ Respond with ONLY the category name (HELP, ABOUT, FILTER, or QUERY)."""},
50
+ {"role": "user", "content": message}
 
 
 
 
51
  ]
52
 
53
+ completion = client.chat.completions.create(
54
+ model="gpt-4o-mini",
55
+ messages=classifier_messages,
56
+ temperature=0
57
+ )
58
+ query_type = completion.choices[0].message.content.strip().upper()
59
  print(f"Query type: {query_type}")
60
 
61
  if query_type == "HELP":
62
  help_content = load_content("help.txt")
63
+ help_messages = [
64
+ {"role": "system", "content": """You are the Tobacco Watcher Help Assistant.
65
  Use the provided help content to guide users on how to use the platform's features.
66
+ Be clear and specific in your instructions. If a feature isn't mentioned in the content, acknowledge that and suggest contacting support."""},
67
+ {"role": "user", "content": f"Using this help content:\n\n{help_content}\n\nAnswer this question: {message}"}
68
  ]
69
+ completion = client.chat.completions.create(
70
+ model="gpt-4o-mini",
71
+ messages=help_messages,
72
+ temperature=0
73
+ )
74
+ return completion.choices[0].message.content
75
 
76
  elif query_type == "ABOUT":
77
  about_content = load_content("about.txt")
78
+ about_messages = [
79
+ {"role": "system", "content": """You are the Tobacco Watcher Assistant specializing in explaining the platform.
80
  Use the provided content to answer questions about Tobacco Watcher's purpose, mission, features, and organization.
81
+ Be concise but informative. If a specific detail isn't in the content, say so rather than making assumptions."""},
82
+ {"role": "user", "content": f"Using this content:\n\n{about_content}\n\nAnswer this question: {message}"}
83
  ]
84
+ completion = client.chat.completions.create(
85
+ model="gpt-4o-mini",
86
+ messages=about_messages,
87
+ temperature=0
88
+ )
89
+ return completion.choices[0].message.content
90
 
91
  elif query_type == "FILTER":
92
  filter_options = load_filter_options()
 
113
 
114
  url_prompt += "\nGenerate a valid URL for this query. Return ONLY the complete URL."
115
 
116
+ url_messages = [
117
+ {"role": "system", "content": url_prompt},
118
+ {"role": "user", "content": message}
119
  ]
120
 
121
  try:
122
+ completion = client.chat.completions.create(
123
+ model="gpt-4o-mini",
124
+ messages=url_messages,
125
+ temperature=0
126
+ )
127
+ url_response = completion.choices[0].message.content.strip()
128
  print(f"Generated URL: {url_response}")
129
 
130
  if url_response.startswith("http"):
feed_to_llm_v2.py CHANGED
@@ -1,9 +1,4 @@
1
- from langchain_openai import ChatOpenAI
2
-
3
- from langchain.schema import (
4
- HumanMessage,
5
- SystemMessage
6
- )
7
  import tiktoken
8
  import re
9
 
@@ -11,6 +6,7 @@ from get_articles import save_solr_articles_full
11
  from rerank import crossencoder_rerank_answer
12
  import logging
13
  from logging.handlers import RotatingFileHandler
 
14
 
15
  # Configure logging
16
  logger = logging.getLogger("TobaccoInfoAssistant")
@@ -23,11 +19,13 @@ formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
23
  handler.setFormatter(formatter)
24
  logger.addHandler(handler)
25
 
 
 
 
26
  def num_tokens_from_string(string: str, encoder) -> int:
27
  num_tokens = len(encoder.encode(string))
28
  return num_tokens
29
 
30
-
31
  def feed_articles_to_gpt_with_links(information, question):
32
  prompt = """
33
  You are a Question Answering system specializing in tobacco-related topics. You have access to several curated articles, each numbered (e.g., Article 1, Article 2). These articles cover various aspects of tobacco use, health effects, legislation, and quitting resources.
@@ -60,6 +58,7 @@ def feed_articles_to_gpt_with_links(information, question):
60
  published_dates = [published_dates for score, contents, uuids, titles, domains, published_dates in information]
61
  logger.info(f"Article retrieved: {len(articles)}")
62
  logger.info(f"Article titles: {titles_list}")
 
63
  for i in range(len(articles)):
64
  addition = f"Article {i + 1}: {articles[i]} {separator}"
65
  token_count += num_tokens_from_string(addition, encoder)
@@ -69,14 +68,18 @@ def feed_articles_to_gpt_with_links(information, question):
69
 
70
  prompt += content
71
  logger.info(f"Prompt: {prompt}")
72
- llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.0)
73
- message = [
74
- SystemMessage(content=prompt),
75
- HumanMessage(content=question)
76
  ]
77
 
78
- response = llm.invoke(message)
79
- response_content = response.content # Access the content of the AIMessage
 
 
 
 
80
  logger.info(f"LLM Response Content: {response_content}")
81
 
82
  # Extract sources from the response content
@@ -84,7 +87,7 @@ def feed_articles_to_gpt_with_links(information, question):
84
  parenthetical_matches = re.findall(r'\(Article \d+\)', response_content)
85
 
86
  if not (inline_matches or parenthetical_matches):
87
- return response_content, [], [], []
88
 
89
  # Combine and get unique article numbers
90
  all_matches = inline_matches + [m.strip('()') for m in parenthetical_matches]
@@ -122,7 +125,7 @@ def feed_articles_to_gpt_with_links(information, question):
122
  cited_published_dates = []
123
  for article_num in used_article_nums:
124
  uuid = uuids[article_num]
125
- link = f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/"
126
  cited_links.append(link)
127
  cited_titles.append(titles_list[article_num])
128
  cited_domains.append(domains_list[article_num])
@@ -133,6 +136,8 @@ if __name__ == "__main__":
133
  question = "How is United States fighting against tobacco addiction?"
134
  rerank_type = "crossencoder"
135
  llm_type = "chat"
136
- csv_path = save_solr_articles_full(question, keyword_type="rake")
 
 
137
  reranked_out = crossencoder_rerank_answer(csv_path, question)
138
  feed_articles_to_gpt_with_links(reranked_out, question)
 
1
+ from openai import OpenAI
 
 
 
 
 
2
  import tiktoken
3
  import re
4
 
 
6
  from rerank import crossencoder_rerank_answer
7
  import logging
8
  from logging.handlers import RotatingFileHandler
9
+ import os
10
 
11
  # Configure logging
12
  logger = logging.getLogger("TobaccoInfoAssistant")
 
19
  handler.setFormatter(formatter)
20
  logger.addHandler(handler)
21
 
22
+ # Initialize OpenAI client
23
+ client = OpenAI()
24
+
25
  def num_tokens_from_string(string: str, encoder) -> int:
26
  num_tokens = len(encoder.encode(string))
27
  return num_tokens
28
 
 
29
  def feed_articles_to_gpt_with_links(information, question):
30
  prompt = """
31
  You are a Question Answering system specializing in tobacco-related topics. You have access to several curated articles, each numbered (e.g., Article 1, Article 2). These articles cover various aspects of tobacco use, health effects, legislation, and quitting resources.
 
58
  published_dates = [published_dates for score, contents, uuids, titles, domains, published_dates in information]
59
  logger.info(f"Article retrieved: {len(articles)}")
60
  logger.info(f"Article titles: {titles_list}")
61
+
62
  for i in range(len(articles)):
63
  addition = f"Article {i + 1}: {articles[i]} {separator}"
64
  token_count += num_tokens_from_string(addition, encoder)
 
68
 
69
  prompt += content
70
  logger.info(f"Prompt: {prompt}")
71
+
72
+ messages = [
73
+ {"role": "system", "content": prompt},
74
+ {"role": "user", "content": question}
75
  ]
76
 
77
+ completion = client.chat.completions.create(
78
+ model="gpt-4o-mini",
79
+ messages=messages,
80
+ temperature=0
81
+ )
82
+ response_content = completion.choices[0].message.content
83
  logger.info(f"LLM Response Content: {response_content}")
84
 
85
  # Extract sources from the response content
 
87
  parenthetical_matches = re.findall(r'\(Article \d+\)', response_content)
88
 
89
  if not (inline_matches or parenthetical_matches):
90
+ return response_content, [], [], [], []
91
 
92
  # Combine and get unique article numbers
93
  all_matches = inline_matches + [m.strip('()') for m in parenthetical_matches]
 
125
  cited_published_dates = []
126
  for article_num in used_article_nums:
127
  uuid = uuids[article_num]
128
+ link = f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/"
129
  cited_links.append(link)
130
  cited_titles.append(titles_list[article_num])
131
  cited_domains.append(domains_list[article_num])
 
136
  question = "How is United States fighting against tobacco addiction?"
137
  rerank_type = "crossencoder"
138
  llm_type = "chat"
139
+ from get_articles import save_solr_articles_full
140
+ from rerank import crossencoder_rerank_answer
141
+ csv_path = save_solr_articles_full(question, 15, keyword_type="rake")
142
  reranked_out = crossencoder_rerank_answer(csv_path, question)
143
  feed_articles_to_gpt_with_links(reranked_out, question)
requirements.txt CHANGED
@@ -1,15 +1,9 @@
1
- gradio==4.25.0
2
- langchain==0.1.14
3
- langchain-core==0.1.40
4
- langchain-openai==0.1.1
5
- nltk==3.8.1
6
- openai==1.16.2
7
- pandas==2.2.1
8
- pysolr==3.9.0
9
- rake-nltk==1.0.6
10
- sentence-transformers==2.2.2
11
- tiktoken==0.5.2
12
- torch==2.1.2
13
- huggingface-hub==0.20.2
14
- python-dotenv==1.0.1
15
- docarray==0.40.0
 
1
+ gradio==4.25.0
2
+ openai
3
+ nltk==3.8.1
4
+ pandas==2.2.1
5
+ pysolr==3.9.0
6
+ rake-nltk==1.0.6
7
+ sentence-transformers==2.2.2
8
+ tiktoken==0.5.2
9
+ python-dotenv==1.0.1