|
from queue import SimpleQueue |
|
from dotenv import load_dotenv |
|
import re |
|
from langchain.callbacks.base import BaseCallbackHandler |
|
|
|
# Unique end-of-stream marker: the callback handler puts it on the token queue
# when an LLM run ends or errors, and consumers compare against it with ``is``.
job_done = object()
|
|
|
|
|
class StreamingGradioCallbackHandler(BaseCallbackHandler):
    """Callback handler that forwards streamed LLM tokens to a queue.

    Only works with LLMs that support streaming. Each new token is put on
    ``q``; the module-level ``job_done`` sentinel is put on ``q`` when the
    run ends or errors, so a consumer knows when to stop reading.
    """

    def __init__(self, q):
        """Store ``q``, the queue that receives tokens and the end marker."""
        self.q = q

    def on_llm_start(self, serialized, prompts, **kwargs) -> None:
        """Run when LLM starts running: discard anything still queued."""
        # Imported locally: the module header only imports SimpleQueue.
        from queue import Empty

        while not self.q.empty():
            try:
                self.q.get(block=False)
            except Empty:
                # ``queue.Empty`` is what a non-blocking get() raises when the
                # queue was emptied between the empty() check and the get().
                # (``SimpleQueue.empty`` is a method, not an exception class.)
                continue

    def on_llm_new_token(self, token, **kwargs) -> None:
        """Run on new LLM token. Only available when streaming is enabled."""
        self.q.put(token)

    def on_llm_end(self, response, **kwargs) -> None:
        """Run when LLM ends running: signal the end of the stream."""
        self.q.put(job_done)

    def on_llm_error(self, error, **kwargs) -> None:
        """Run when LLM errors: signal the end of the stream as well."""
        # The error itself is not forwarded; the consumer only sees the
        # end-of-stream marker.
        self.q.put(job_done)
|
|
|
|
|
def add_gradio_streaming(llm):
    """Attach a streaming callback to ``llm`` and return it with its queue.

    A fresh ``SimpleQueue`` is created and wrapped in a
    ``StreamingGradioCallbackHandler``. The handler signals the end of a run
    with the module-level ``job_done`` sentinel.

    Args:
        llm: Object whose ``callbacks`` attribute is assigned.

    Returns:
        A ``(llm, q)`` tuple: the same ``llm`` object and the queue that the
        handler writes to.
    """
    q = SimpleQueue()
    # Replaces (does not extend) whatever callbacks were set before.
    llm.callbacks = [StreamingGradioCallbackHandler(q)]
    return llm, q
|
|
|
|
|
def _find_stream_queue(llm):
    """Return the queue of the first streaming handler in ``llm.callbacks``."""
    for handler in getattr(llm, "callbacks", None) or []:
        if isinstance(handler, StreamingGradioCallbackHandler):
            return handler.q
    raise ValueError(
        "No StreamingGradioCallbackHandler found on llm.callbacks; "
        "call add_gradio_streaming(llm) first or pass the queue as `q`."
    )


def gradio_stream(llm, prompt, q=None):
    """Run ``llm.predict(text=prompt)`` in a thread and yield the growing text.

    Tokens are read from the queue filled by ``StreamingGradioCallbackHandler``
    until the module-level ``job_done`` sentinel is received.

    Args:
        llm: Object exposing ``predict(text=...)``.
        prompt: Text passed to ``llm.predict``.
        q: Queue the streaming handler writes to. When omitted, it is taken
            from the ``StreamingGradioCallbackHandler`` found in
            ``llm.callbacks``.

    Yields:
        The text accumulated so far, once per received token.

    Raises:
        ValueError: If ``q`` is omitted and no streaming handler is attached.
    """
    # Imported locally: the module header imports neither of these.
    import time
    from threading import Thread

    # Resolve the queue before starting the worker so that a misconfigured
    # LLM fails without leaving a background thread running.
    if q is None:
        q = _find_stream_queue(llm)

    thread = Thread(target=llm.predict, kwargs={"text": prompt})
    thread.start()
    text = ""
    while True:
        next_token = q.get(block=True)
        if next_token is job_done:
            break
        text += next_token
        # Short pause between updates.
        time.sleep(0.03)
        yield text
    thread.join()
|
|
|
|
|
def get_source_link(metadata):
    """Return the file URL with a ``#page=N`` fragment appended.

    ``N`` is ``metadata["content_page_number"] + 1`` (the stored page number
    is presumably zero-based; confirm against the indexing code).
    """
    base_url = metadata["file_url"]
    page = metadata["content_page_number"] + 1
    return base_url + f"#page={page}"
|
|
|
|
|
def _render_source_card(i, heading, body, source_type, score, link=None):
    """Render one source card; ``link`` adds an "open" anchor when given."""
    link_html = ""
    if link is not None:
        # The href value is always quoted so URLs containing spaces or other
        # special characters do not break the attribute.
        link_html = f"""
        <a href="{link}" target="_blank">
            <span role="img" aria-label="Open PDF">🔗</span>
        </a>"""
    return f"""
<div class="card" id="doc{i}">
    <div class="card-content">
        <h2>Doc {i} - {heading}</h2>
        <p>{body}</p>
    </div>
    <div class="card-footer">
        <span>{source_type}</span>
        <span>Relevance Score : {round(100 * score, 1)}%</span>{link_html}
    </div>
</div>
"""


def make_html_source(source, i, score, config):
    """Build the HTML card for one retrieved source document.

    Args:
        source: Object with a ``metadata`` mapping and a ``page_content``
            string.
        i: Document number; used in the title and as the ``doc{i}`` element
            id, which is the target of the ``#doc{n}`` links produced by
            ``parse_output_llm_with_sources``.
        score: Relevance score, displayed as a percentage
            (``round(100 * score, 1)``).
        config: Mapping providing ``"passage_preprompt"``; only read for
            sources that are neither "AFP" nor "Presse".

    Returns:
        An HTML snippet. "AFP" cards never carry a link; "Presse" cards link
        to ``file_url`` unless it is the string ``"none"``; all other cards
        link to the page returned by ``get_source_link`` when ``file_url`` is
        truthy.
    """
    # NOTE(review): titles, passage text and URLs are interpolated without
    # HTML escaping, as in the original implementation — confirm the indexed
    # content is trusted or escape it upstream.
    meta = source.metadata
    source_type = meta["file_source_type"]

    if source_type == "AFP":
        heading = f"{meta['file_title']} - {meta['file_type']} AFP"
        return _render_source_card(
            i, heading, source.page_content, source_type, score
        )

    if source_type == "Presse":
        heading = f"{meta['file_title']} - {meta['file_publisher']}"
        # Press articles use the literal string "none" for a missing URL.
        link = meta["file_url"] if meta["file_url"] != "none" else None
        return _render_source_card(
            i, heading, source.page_content, source_type, score, link
        )

    # Any other source type: show the page number and strip the preprompt
    # from the passage text.
    heading = f"{meta['file_title']} - Page {meta['content_page_number'] + 1}"
    body = source.page_content.replace(config["passage_preprompt"], "")
    link = get_source_link(meta) if meta["file_url"] else None
    return _render_source_card(i, heading, body, source_type, score, link)
|
|
|
|
|
# NOTE(review): the trailing character class ``[\])?]`` requires one of
# ``]``, ``)`` or ``?`` after the reference and consumes it; it looks like
# ``[\])]?`` (optional closing bracket) may have been intended — confirm
# before changing, since that would also match bare "Doc 3" mentions.
_DOC_REF_PATTERN = re.compile(
    r"[\[(]?(Doc\s?\d+(?:,\s?Doc\s?\d+)*|doc\s?\d+(?:,\s?doc\s?\d+)*|Doc\s\d+)[\])?]"
)


def parse_output_llm_with_sources(output):
    """Replace document references in LLM output with superscript links.

    References such as ``[Doc 1]`` or ``(Doc 1, Doc 2)`` are turned into
    ``<a href="#doc{n}">`` anchors, one per referenced document; all other
    text is returned unchanged.

    Args:
        output: Raw text produced by the LLM.

    Returns:
        The text with every matched reference replaced by its HTML links.
    """
    rendered = []
    # With a single capture group, re.split alternates plain text (even
    # indices) and captured references (odd indices). Relying on position
    # rather than on a "starts with doc" check keeps ordinary text such as
    # "Documents ..." from being mistaken for a reference.
    for index, piece in enumerate(_DOC_REF_PATTERN.split(output)):
        if index % 2 == 0:
            rendered.append(piece)
            continue
        for reference in piece.split(","):
            subpart = reference.lower().replace("doc", "").strip()
            rendered.append(
                f"""<a href="#doc{subpart}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup>{subpart}</sup></span></a>"""
            )
    return "".join(rendered)
|
|
|
|
|
def clear_text_box(textbox):
    """Return an empty string regardless of ``textbox``'s current value."""
    emptied = ""
    return emptied
|
|
|
|
|
def add_text(chatbot, text):
    """Return a new history with ``(text, None)`` appended, and ``text``.

    The incoming ``chatbot`` list is not modified; a new list is built by
    concatenation.
    """
    pending_turn = (text, None)
    history = chatbot + [pending_turn]
    return history, text
|
|
|
|
|
def init_env():
    """Load environment variables via ``load_dotenv`` on a best-effort basis.

    A failure to load is logged as a warning and otherwise ignored, so the
    application can still start. ``KeyboardInterrupt`` and ``SystemExit`` are
    no longer swallowed.
    """
    # Imported locally: the module header does not import logging.
    import logging

    try:
        load_dotenv()
    except Exception:
        logging.getLogger(__name__).warning(
            "Could not load environment variables from .env", exc_info=True
        )
|
|