Upload 6 files
Browse files
replaced langchain with OpenAI
- app.py +54 -45
- feed_to_llm_v2.py +21 -16
- requirements.txt +9 -15
app.py
CHANGED
@@ -1,21 +1,18 @@
|
|
1 |
import gradio as gr
|
2 |
from full_chain import get_response
|
3 |
import os
|
4 |
-
import
|
|
|
5 |
import json
|
6 |
|
7 |
-
from langchain_openai import ChatOpenAI
|
8 |
-
from langchain.schema import HumanMessage, SystemMessage
|
9 |
import urllib3
|
10 |
|
11 |
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
12 |
-
|
13 |
-
#
|
14 |
-
#
|
15 |
-
llm = ChatOpenAI(
|
16 |
-
model="gpt-4o-mini",
|
17 |
-
temperature=0
|
18 |
-
)
|
19 |
|
20 |
def load_content(filename):
|
21 |
"""Load content from text files"""
|
@@ -34,54 +31,62 @@ def predict(message, history):
|
|
34 |
"""Process user message and return appropriate response."""
|
35 |
try:
|
36 |
# Query classification prompt
|
37 |
-
|
|
|
38 |
|
39 |
-
|
40 |
-
|
41 |
|
42 |
-
|
43 |
-
|
44 |
|
45 |
-
|
46 |
-
|
47 |
|
48 |
-
|
49 |
-
|
50 |
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
messages = [
|
55 |
-
SystemMessage(content=classifier_prompt),
|
56 |
-
HumanMessage(content=message)
|
57 |
]
|
58 |
|
59 |
-
|
60 |
-
|
|
|
|
|
|
|
|
|
61 |
print(f"Query type: {query_type}")
|
62 |
|
63 |
if query_type == "HELP":
|
64 |
help_content = load_content("help.txt")
|
65 |
-
|
66 |
-
|
67 |
Use the provided help content to guide users on how to use the platform's features.
|
68 |
-
Be clear and specific in your instructions. If a feature isn't mentioned in the content, acknowledge that and suggest contacting support."""
|
69 |
-
|
70 |
]
|
71 |
-
|
72 |
-
|
73 |
-
|
|
|
|
|
|
|
74 |
|
75 |
elif query_type == "ABOUT":
|
76 |
about_content = load_content("about.txt")
|
77 |
-
|
78 |
-
|
79 |
Use the provided content to answer questions about Tobacco Watcher's purpose, mission, features, and organization.
|
80 |
-
Be concise but informative. If a specific detail isn't in the content, say so rather than making assumptions."""
|
81 |
-
|
82 |
]
|
83 |
-
|
84 |
-
|
|
|
|
|
|
|
|
|
85 |
|
86 |
elif query_type == "FILTER":
|
87 |
filter_options = load_filter_options()
|
@@ -108,14 +113,18 @@ Be concise but informative. If a specific detail isn't in the content, say so ra
|
|
108 |
|
109 |
url_prompt += "\nGenerate a valid URL for this query. Return ONLY the complete URL."
|
110 |
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
]
|
115 |
|
116 |
try:
|
117 |
-
|
118 |
-
|
|
|
|
|
|
|
|
|
119 |
print(f"Generated URL: {url_response}")
|
120 |
|
121 |
if url_response.startswith("http"):
|
|
|
1 |
import gradio as gr
|
2 |
from full_chain import get_response
|
3 |
import os
|
4 |
+
import json
|
5 |
+
from openai import OpenAI
|
6 |
import json
|
7 |
|
8 |
+
# from langchain_openai import ChatOpenAI
|
9 |
+
# from langchain.schema import HumanMessage, SystemMessage
|
10 |
import urllib3
|
11 |
|
12 |
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
13 |
+
|
14 |
+
# Initialize OpenAI client
|
15 |
+
client = OpenAI() # It will automatically use OPENAI_API_KEY from environment
|
|
|
|
|
|
|
|
|
16 |
|
17 |
def load_content(filename):
|
18 |
"""Load content from text files"""
|
|
|
31 |
"""Process user message and return appropriate response."""
|
32 |
try:
|
33 |
# Query classification prompt
|
34 |
+
classifier_messages = [
|
35 |
+
{"role": "system", "content": """You are the Tobacco Watcher Assistant. Analyze the user's query and categorize it into exactly ONE of these types:
|
36 |
|
37 |
+
1. HELP - Questions about using the website, its features, or navigation
|
38 |
+
Example: "How do I use filters?", "How to search for articles?"
|
39 |
|
40 |
+
2. ABOUT - Questions about Tobacco Watcher's purpose, mission, or organization
|
41 |
+
Example: "What is Tobacco Watcher?", "Who runs this website?"
|
42 |
|
43 |
+
3. FILTER - Requests for specific articles using filters
|
44 |
+
Example: "Show articles about smoking in India from 2023", "Find French articles about e-cigarettes"
|
45 |
|
46 |
+
4. QUERY - Questions seeking tobacco-related information
|
47 |
+
Example: "How many people smoke in Asia?", "What are the effects of secondhand smoke?"
|
48 |
|
49 |
+
Respond with ONLY the category name (HELP, ABOUT, FILTER, or QUERY)."""},
|
50 |
+
{"role": "user", "content": message}
|
|
|
|
|
|
|
|
|
51 |
]
|
52 |
|
53 |
+
completion = client.chat.completions.create(
|
54 |
+
model="gpt-4o-mini",
|
55 |
+
messages=classifier_messages,
|
56 |
+
temperature=0
|
57 |
+
)
|
58 |
+
query_type = completion.choices[0].message.content.strip().upper()
|
59 |
print(f"Query type: {query_type}")
|
60 |
|
61 |
if query_type == "HELP":
|
62 |
help_content = load_content("help.txt")
|
63 |
+
help_messages = [
|
64 |
+
{"role": "system", "content": """You are the Tobacco Watcher Help Assistant.
|
65 |
Use the provided help content to guide users on how to use the platform's features.
|
66 |
+
Be clear and specific in your instructions. If a feature isn't mentioned in the content, acknowledge that and suggest contacting support."""},
|
67 |
+
{"role": "user", "content": f"Using this help content:\n\n{help_content}\n\nAnswer this question: {message}"}
|
68 |
]
|
69 |
+
completion = client.chat.completions.create(
|
70 |
+
model="gpt-4o-mini",
|
71 |
+
messages=help_messages,
|
72 |
+
temperature=0
|
73 |
+
)
|
74 |
+
return completion.choices[0].message.content
|
75 |
|
76 |
elif query_type == "ABOUT":
|
77 |
about_content = load_content("about.txt")
|
78 |
+
about_messages = [
|
79 |
+
{"role": "system", "content": """You are the Tobacco Watcher Assistant specializing in explaining the platform.
|
80 |
Use the provided content to answer questions about Tobacco Watcher's purpose, mission, features, and organization.
|
81 |
+
Be concise but informative. If a specific detail isn't in the content, say so rather than making assumptions."""},
|
82 |
+
{"role": "user", "content": f"Using this content:\n\n{about_content}\n\nAnswer this question: {message}"}
|
83 |
]
|
84 |
+
completion = client.chat.completions.create(
|
85 |
+
model="gpt-4o-mini",
|
86 |
+
messages=about_messages,
|
87 |
+
temperature=0
|
88 |
+
)
|
89 |
+
return completion.choices[0].message.content
|
90 |
|
91 |
elif query_type == "FILTER":
|
92 |
filter_options = load_filter_options()
|
|
|
113 |
|
114 |
url_prompt += "\nGenerate a valid URL for this query. Return ONLY the complete URL."
|
115 |
|
116 |
+
url_messages = [
|
117 |
+
{"role": "system", "content": url_prompt},
|
118 |
+
{"role": "user", "content": message}
|
119 |
]
|
120 |
|
121 |
try:
|
122 |
+
completion = client.chat.completions.create(
|
123 |
+
model="gpt-4o-mini",
|
124 |
+
messages=url_messages,
|
125 |
+
temperature=0
|
126 |
+
)
|
127 |
+
url_response = completion.choices[0].message.content.strip()
|
128 |
print(f"Generated URL: {url_response}")
|
129 |
|
130 |
if url_response.startswith("http"):
|
feed_to_llm_v2.py
CHANGED
@@ -1,9 +1,4 @@
|
|
1 |
-
from
|
2 |
-
|
3 |
-
from langchain.schema import (
|
4 |
-
HumanMessage,
|
5 |
-
SystemMessage
|
6 |
-
)
|
7 |
import tiktoken
|
8 |
import re
|
9 |
|
@@ -11,6 +6,7 @@ from get_articles import save_solr_articles_full
|
|
11 |
from rerank import crossencoder_rerank_answer
|
12 |
import logging
|
13 |
from logging.handlers import RotatingFileHandler
|
|
|
14 |
|
15 |
# Configure logging
|
16 |
logger = logging.getLogger("TobaccoInfoAssistant")
|
@@ -23,11 +19,13 @@ formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
|
|
23 |
handler.setFormatter(formatter)
|
24 |
logger.addHandler(handler)
|
25 |
|
|
|
|
|
|
|
26 |
def num_tokens_from_string(string: str, encoder) -> int:
|
27 |
num_tokens = len(encoder.encode(string))
|
28 |
return num_tokens
|
29 |
|
30 |
-
|
31 |
def feed_articles_to_gpt_with_links(information, question):
|
32 |
prompt = """
|
33 |
You are a Question Answering system specializing in tobacco-related topics. You have access to several curated articles, each numbered (e.g., Article 1, Article 2). These articles cover various aspects of tobacco use, health effects, legislation, and quitting resources.
|
@@ -60,6 +58,7 @@ def feed_articles_to_gpt_with_links(information, question):
|
|
60 |
published_dates = [published_dates for score, contents, uuids, titles, domains, published_dates in information]
|
61 |
logger.info(f"Article retrieved: {len(articles)}")
|
62 |
logger.info(f"Article titles: {titles_list}")
|
|
|
63 |
for i in range(len(articles)):
|
64 |
addition = f"Article {i + 1}: {articles[i]} {separator}"
|
65 |
token_count += num_tokens_from_string(addition, encoder)
|
@@ -69,14 +68,18 @@ def feed_articles_to_gpt_with_links(information, question):
|
|
69 |
|
70 |
prompt += content
|
71 |
logger.info(f"Prompt: {prompt}")
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
]
|
77 |
|
78 |
-
|
79 |
-
|
|
|
|
|
|
|
|
|
80 |
logger.info(f"LLM Response Content: {response_content}")
|
81 |
|
82 |
# Extract sources from the response content
|
@@ -84,7 +87,7 @@ def feed_articles_to_gpt_with_links(information, question):
|
|
84 |
parenthetical_matches = re.findall(r'\(Article \d+\)', response_content)
|
85 |
|
86 |
if not (inline_matches or parenthetical_matches):
|
87 |
-
return response_content, [], [], []
|
88 |
|
89 |
# Combine and get unique article numbers
|
90 |
all_matches = inline_matches + [m.strip('()') for m in parenthetical_matches]
|
@@ -122,7 +125,7 @@ def feed_articles_to_gpt_with_links(information, question):
|
|
122 |
cited_published_dates = []
|
123 |
for article_num in used_article_nums:
|
124 |
uuid = uuids[article_num]
|
125 |
-
link = f"https://tobaccowatcher.
|
126 |
cited_links.append(link)
|
127 |
cited_titles.append(titles_list[article_num])
|
128 |
cited_domains.append(domains_list[article_num])
|
@@ -133,6 +136,8 @@ if __name__ == "__main__":
|
|
133 |
question = "How is United States fighting against tobacco addiction?"
|
134 |
rerank_type = "crossencoder"
|
135 |
llm_type = "chat"
|
136 |
-
|
|
|
|
|
137 |
reranked_out = crossencoder_rerank_answer(csv_path, question)
|
138 |
feed_articles_to_gpt_with_links(reranked_out, question)
|
|
|
1 |
+
from openai import OpenAI
|
|
|
|
|
|
|
|
|
|
|
2 |
import tiktoken
|
3 |
import re
|
4 |
|
|
|
6 |
from rerank import crossencoder_rerank_answer
|
7 |
import logging
|
8 |
from logging.handlers import RotatingFileHandler
|
9 |
+
import os
|
10 |
|
11 |
# Configure logging
|
12 |
logger = logging.getLogger("TobaccoInfoAssistant")
|
|
|
19 |
handler.setFormatter(formatter)
|
20 |
logger.addHandler(handler)
|
21 |
|
22 |
+
# Initialize OpenAI client
|
23 |
+
client = OpenAI()
|
24 |
+
|
25 |
def num_tokens_from_string(string: str, encoder) -> int:
|
26 |
num_tokens = len(encoder.encode(string))
|
27 |
return num_tokens
|
28 |
|
|
|
29 |
def feed_articles_to_gpt_with_links(information, question):
|
30 |
prompt = """
|
31 |
You are a Question Answering system specializing in tobacco-related topics. You have access to several curated articles, each numbered (e.g., Article 1, Article 2). These articles cover various aspects of tobacco use, health effects, legislation, and quitting resources.
|
|
|
58 |
published_dates = [published_dates for score, contents, uuids, titles, domains, published_dates in information]
|
59 |
logger.info(f"Article retrieved: {len(articles)}")
|
60 |
logger.info(f"Article titles: {titles_list}")
|
61 |
+
|
62 |
for i in range(len(articles)):
|
63 |
addition = f"Article {i + 1}: {articles[i]} {separator}"
|
64 |
token_count += num_tokens_from_string(addition, encoder)
|
|
|
68 |
|
69 |
prompt += content
|
70 |
logger.info(f"Prompt: {prompt}")
|
71 |
+
|
72 |
+
messages = [
|
73 |
+
{"role": "system", "content": prompt},
|
74 |
+
{"role": "user", "content": question}
|
75 |
]
|
76 |
|
77 |
+
completion = client.chat.completions.create(
|
78 |
+
model="gpt-4o-mini",
|
79 |
+
messages=messages,
|
80 |
+
temperature=0
|
81 |
+
)
|
82 |
+
response_content = completion.choices[0].message.content
|
83 |
logger.info(f"LLM Response Content: {response_content}")
|
84 |
|
85 |
# Extract sources from the response content
|
|
|
87 |
parenthetical_matches = re.findall(r'\(Article \d+\)', response_content)
|
88 |
|
89 |
if not (inline_matches or parenthetical_matches):
|
90 |
+
return response_content, [], [], [], []
|
91 |
|
92 |
# Combine and get unique article numbers
|
93 |
all_matches = inline_matches + [m.strip('()') for m in parenthetical_matches]
|
|
|
125 |
cited_published_dates = []
|
126 |
for article_num in used_article_nums:
|
127 |
uuid = uuids[article_num]
|
128 |
+
link = f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/"
|
129 |
cited_links.append(link)
|
130 |
cited_titles.append(titles_list[article_num])
|
131 |
cited_domains.append(domains_list[article_num])
|
|
|
136 |
question = "How is United States fighting against tobacco addiction?"
|
137 |
rerank_type = "crossencoder"
|
138 |
llm_type = "chat"
|
139 |
+
from get_articles import save_solr_articles_full
|
140 |
+
from rerank import crossencoder_rerank_answer
|
141 |
+
csv_path = save_solr_articles_full(question, 15, keyword_type="rake")
|
142 |
reranked_out = crossencoder_rerank_answer(csv_path, question)
|
143 |
feed_articles_to_gpt_with_links(reranked_out, question)
|
requirements.txt
CHANGED
@@ -1,15 +1,9 @@
|
|
1 |
-
gradio==4.25.0
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
sentence-transformers==2.2.2
|
11 |
-
tiktoken==0.5.2
|
12 |
-
torch==2.1.2
|
13 |
-
huggingface-hub==0.20.2
|
14 |
-
python-dotenv==1.0.1
|
15 |
-
docarray==0.40.0
|
|
|
1 |
+
gradio==4.25.0
|
2 |
+
openai
|
3 |
+
nltk==3.8.1
|
4 |
+
pandas==2.2.1
|
5 |
+
pysolr==3.9.0
|
6 |
+
rake-nltk==1.0.6
|
7 |
+
sentence-transformers==2.2.2
|
8 |
+
tiktoken==0.5.2
|
9 |
+
python-dotenv==1.0.1
|
|
|
|
|
|
|
|
|
|
|
|