Spaces:

polygraf-ai
/

article_writer

Runtime error

App Files Files Community

eljanmahammadli committed on Sep 7, 2024

Commit

b26a983

2 Parent(s): 88a1d09 80a07a7

Merge branch 'staging'

Browse files

Files changed (4) hide show

ai_generate.py +246 -18
app.py +740 -463
humanize.py +58 -10
requirements.txt +2 -1

ai_generate.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import os
 from langchain_community.document_loaders import PyMuPDFLoader
 from langchain_core.documents import Document
@@ -15,6 +16,16 @@ from langchain_openai import ChatOpenAI
 from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_anthropic import ChatAnthropic
 from dotenv import load_dotenv
 load_dotenv()
@@ -26,7 +37,17 @@ os.environ["GLOG_minloglevel"] = "2"
 CHUNK_SIZE = 1024
 CHUNK_OVERLAP = CHUNK_SIZE // 8
 K = 10
-FETCH_K = 20
 llm_model_translation = {
     "LLaMA 3": "llama3-70b-8192",
@@ -47,6 +68,138 @@ llm_classes = {
 }
 def load_llm(model: str, api_key: str, temperature: float = 1.0, max_length: int = 2048):
     model_name = llm_model_translation.get(model)
     llm_class = llm_classes.get(model_name)
@@ -60,10 +213,9 @@ def load_llm(model: str, api_key: str, temperature: float = 1.0, max_length: int
     return llm
-def create_db_with_langchain(path: list[str], url_content: dict):
     all_docs = []
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
-    embedding_function = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")
     if path:
         for file in path:
             loader = PyMuPDFLoader(file)
@@ -79,18 +231,38 @@ def create_db_with_langchain(path: list[str], url_content: dict):
             docs = text_splitter.split_documents([doc])
             all_docs.extend(docs)
     # print docs
     for idx, doc in enumerate(all_docs):
         print(f"Doc: {idx} | Length = {len(doc.page_content)}")
     assert len(all_docs) > 0, "No PDFs or scrapped data provided"
     db = Chroma.from_documents(all_docs, embedding_function)
     return db
 def generate_rag(
     prompt: str,
     topic: str,
     model: str,
     url_content: dict,
     path: list[str],
@@ -103,19 +275,25 @@ def generate_rag(
     if llm is None:
         print("Failed to load LLM. Aborting operation.")
         return None
-    db = create_db_with_langchain(path, url_content)
-    retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": K, "fetch_k": FETCH_K})
-    rag_prompt = hub.pull("rlm/rag-prompt")
-    def format_docs(docs):
-        return "\n\n".join(doc.page_content for doc in docs)
-    docs = retriever.get_relevant_documents(topic)
-    formatted_docs = format_docs(docs)
-    rag_chain = (
-        {"context": lambda _: formatted_docs, "question": RunnablePassthrough()} | rag_prompt | llm | StrOutputParser()
-    )
-    return rag_chain.invoke(prompt)
 def generate_base(
@@ -124,18 +302,21 @@ def generate_base(
     llm = load_llm(model, api_key, temperature, max_length)
     if llm is None:
         print("Failed to load LLM. Aborting operation.")
-        return None
     try:
         output = llm.invoke(prompt).content
-        return output
     except Exception as e:
         print(f"An error occurred while running the model: {e}")
-        return None
 def generate(
     prompt: str,
     topic: str,
     model: str,
     url_content: dict,
     path: list[str],
@@ -145,6 +326,53 @@ def generate(
     sys_message="",
 ):
     if path or url_content:
-        return generate_rag(prompt, topic, model, url_content, path, temperature, max_length, api_key, sys_message)
     else:
         return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)

+import gc
 import os
 from langchain_community.document_loaders import PyMuPDFLoader
 from langchain_core.documents import Document
 from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_anthropic import ChatAnthropic
 from dotenv import load_dotenv
+from langchain_core.output_parsers import XMLOutputParser
+from langchain.prompts import ChatPromptTemplate
+import re
+import numpy as np
+import torch
+import bm25s
+from langchain_community.cross_encoders import HuggingFaceCrossEncoder
+from langchain.retrievers import ContextualCompressionRetriever
+from langchain.retrievers.document_compressors import CrossEncoderReranker
+from langchain_core.messages import HumanMessage
 load_dotenv()
 CHUNK_SIZE = 1024
 CHUNK_OVERLAP = CHUNK_SIZE // 8
 K = 10
+FETCH_K = 50
+model_kwargs = {"device": "cuda:1"}
+print("Loading embedding and reranker models...")
+embedding_function = SentenceTransformerEmbeddings(
+    model_name="mixedbread-ai/mxbai-embed-large-v1", model_kwargs=model_kwargs
+)
+# "sentence-transformers/all-MiniLM-L6-v2"
+# "mixedbread-ai/mxbai-embed-large-v1"
+reranker = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base", model_kwargs=model_kwargs)
+compressor = CrossEncoderReranker(model=reranker, top_n=K)
 llm_model_translation = {
     "LLaMA 3": "llama3-70b-8192",
 }
+xml_system = """You're a helpful AI assistant. Given a user prompt and some related sources, fulfill all the requirements \
+of the prompt and provide citations. If a chunk of the generated text does not use any of the sources (for example, \
+introductions or general text), don't put a citation for that chunk and just leave "citations" section empty. Otherwise, \
+list all sources used for that chunk of the text. Remember, don't add inline citations in the text itself in any circumstant.
+Add all citations to the separate citations section. Use explicit new lines in the text to show paragraph splits. For each chunk use this example format:
+<chunk>
+    <text>This is a sample text chunk....</text>
+    <citations>
+        <citation>1</citation>
+        <citation>3</citation>
+        ...
+    </citations>
+</chunk>
+If the prompt asks for a reference section, add it in a chunk without any citations
+Return a citation for every quote across all articles that justify the text. Remember use the following format for your final output:
+<cited_text>
+    <chunk>
+        <text></text>
+        <citations>
+            <citation><source_id></source_id></citation>
+            ...
+        </citations>
+    </chunk>
+    <chunk>
+        <text></text>
+        <citations>
+            <citation><source_id></source_id></citation>
+            ...
+        </citations>
+    </chunk>
+    ...
+</cited_text>
+The entire text should be wrapped in one cited_text. For References section (if asked by prompt), don't add citations.
+For source id, give a valid integer alone without a key.
+Here are the sources:{context}"""
+xml_prompt = ChatPromptTemplate.from_messages([("system", xml_system), ("human", "{input}")])
def format_docs_xml(docs: list[Document]) -> str:
    """Render documents as a ``<sources>`` block for the citation prompt.

    Each document becomes one ``<source>`` element whose ``id`` attribute is
    the document's position in ``docs``.

    Args:
        docs: Documents exposing ``page_content`` and a ``metadata`` mapping.

    Returns:
        Two newlines followed by ``<sources>...</sources>``; the individual
        ``<source>`` elements are separated by single newlines.
    """
    formatted = []
    for i, doc in enumerate(docs):
        # A document without a "source" entry must not abort the whole
        # request with a KeyError; emit a placeholder path instead.
        source = doc.metadata.get("source", "unknown")
        formatted.append(
            f'    <source id="{i}">\n'
            f"        <path>{source}</path>\n"
            f"        <article_snippet>{doc.page_content}</article_snippet>\n"
            "    </source>"
        )
    return "\n\n<sources>" + "\n".join(formatted) + "</sources>"
def get_doc_content(docs, id):
    """Return the ``page_content`` of the document at index ``id`` of ``docs``."""
    selected = docs[id]
    return selected.page_content
def remove_citations(text):
    """Return ``text`` with every ``<N>`` marker (angle-bracketed digits) removed."""
    marker = re.compile(r"<\d+>")
    return marker.sub("", text)
def _parse_citation_id(citation):
    """Convert one ``citation`` payload to an integer id, or ``None`` if unusable."""
    # A nested <source_id> element may arrive as a dict or as a list of
    # single-key dicts, depending on how the XML was turned into Python data.
    if isinstance(citation, list):
        citation = next(
            (entry for entry in citation if isinstance(entry, dict) and "source_id" in entry),
            None,
        )
    if isinstance(citation, dict):
        citation = citation.get("source_id")
    if isinstance(citation, bool):
        return None
    if isinstance(citation, int):
        return citation
    if isinstance(citation, str):
        try:
            return int(citation)
        except ValueError:
            return None  # not a valid integer: ignore this citation
    return None


def _cited_items(data):
    """Return the ``cited_text`` list of ``data``, or an empty list if absent/malformed."""
    if not isinstance(data, dict):
        return []
    items = data.get("cited_text")
    return items if isinstance(items, list) else []


def _split_chunk(item):
    """Return ``(text, citation_ids)`` for one ``cited_text`` entry.

    Returns ``None`` when the entry carries no non-empty ``chunk`` list.
    """
    if not isinstance(item, dict):
        return None
    parts = item.get("chunk")
    if not isinstance(parts, list) or not parts:
        return None

    text = None
    raw_citations = None
    for part in parts:
        if not isinstance(part, dict):
            continue
        if text is None and "text" in part:
            value = part["text"]
            # An empty <text></text> element yields None rather than "".
            text = value if isinstance(value, str) else ""
        if raw_citations is None and "citations" in part:
            raw_citations = part["citations"]

    citation_ids = []
    if isinstance(raw_citations, list):
        for entry in raw_citations:
            if isinstance(entry, dict) and "citation" in entry:
                citation_id = _parse_citation_id(entry["citation"])
                if citation_id is not None:
                    citation_ids.append(citation_id)
    return text or "", citation_ids


def display_cited_text(data):
    """Flatten parsed cited output into plain text with ``<N>`` citation markers.

    ``data`` is expected to look like
    ``{"cited_text": [{"chunk": [{"text": ...}, {"citations": [{"citation": ...}]}]}]}``.
    Each chunk contributes its text, then (when it has citations) a space and
    one ``<id>`` marker per citation, then a blank-line separator. Entries
    without a usable ``chunk`` are skipped.

    Args:
        data: Parsed output dictionary; ``None`` or a dict without
            ``cited_text`` yields an empty string.

    Returns:
        The combined text.
    """
    paragraphs = []
    for item in _cited_items(data):
        parsed = _split_chunk(item)
        if parsed is None:
            continue
        text, citation_ids = parsed
        if citation_ids:
            text += " " + "".join(f"<{cid}>" for cid in citation_ids)
        paragraphs.append(text + "\n\n")
    return "".join(paragraphs)


def get_citations(data, docs):
    """Collect the unique documents cited anywhere in ``data``.

    Args:
        data: Parsed output dictionary in the same shape accepted by
            ``display_cited_text``.
        docs: Sequence of documents exposing ``metadata`` and
            ``page_content``; citation ids are positions in this sequence.

    Returns:
        A dict mapping each cited id to ``{"source": ..., "content": ...}``,
        in first-seen order. Ids that do not address an element of ``docs``
        are ignored.
    """
    citations = {}
    for item in _cited_items(data):
        parsed = _split_chunk(item)
        if parsed is None:
            continue
        for citation_id in parsed[1]:
            # Ids come from model output, so they may fall outside ``docs``;
            # a negative id would otherwise silently index from the end.
            if citation_id in citations or not 0 <= citation_id < len(docs):
                continue
            doc = docs[citation_id]
            citations[citation_id] = {
                "source": doc.metadata.get("source", "unknown"),
                "content": doc.page_content,
            }
    return citations
def citations_to_html(citations):
    """Render a citations mapping as HTML list items.

    Args:
        citations: Mapping of citation id to a dict with ``source`` and
            ``content`` entries. A falsy value yields an empty string.

    Returns:
        One ``<li>`` per citation followed by ``</ul></body></html>``, or
        ``""`` when there is nothing to render.
    """
    import html  # local import: keeps this block independent of the file's import list

    if not citations:
        return ""

    items = []
    for citation_id, citation_info in citations.items():
        # Source paths and page content originate from uploaded or scraped
        # documents, so they are escaped before being placed into markup.
        source = html.escape(str(citation_info["source"]))
        content = html.escape(str(citation_info["content"]))
        items.append(
            f"<li><strong>Source ID:</strong> {citation_id}<br>"
            f"<strong>Path:</strong> {source}<br>"
            f"<strong>Page Content:</strong> {content}</li>"
        )
    # NOTE(review): the closing tags have no matching opening tags in this
    # function; presumably the caller emits them - confirm before changing.
    return "".join(items) + "</ul></body></html>"
 def load_llm(model: str, api_key: str, temperature: float = 1.0, max_length: int = 2048):
     model_name = llm_model_translation.get(model)
     llm_class = llm_classes.get(model_name)
     return llm
+def create_db_with_langchain(path: list[str], url_content: dict, query: str):
     all_docs = []
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
     if path:
         for file in path:
             loader = PyMuPDFLoader(file)
             docs = text_splitter.split_documents([doc])
             all_docs.extend(docs)
+    print(f"### Total number of documents before bm25s: {len(all_docs)}")
+    # if the number of docs is too high, we need to reduce it
+    num_max_docs = 300
+    if len(all_docs) > num_max_docs:
+        docs_raw = [doc.page_content for doc in all_docs]
+        retriever = bm25s.BM25(corpus=docs_raw)
+        retriever.index(bm25s.tokenize(docs_raw))
+        results, scores = retriever.retrieve(bm25s.tokenize(query), k=len(docs_raw), sorted=False)
+        top_indices = np.argpartition(scores[0], -num_max_docs)[-num_max_docs:]
+        all_docs = [all_docs[i] for i in top_indices]
     # print docs
     for idx, doc in enumerate(all_docs):
         print(f"Doc: {idx} | Length = {len(doc.page_content)}")
     assert len(all_docs) > 0, "No PDFs or scrapped data provided"
     db = Chroma.from_documents(all_docs, embedding_function)
+    torch.cuda.empty_cache()
+    gc.collect()
     return db
def pretty_print_docs(docs):
    """Print each document's content under a ``Document N:`` heading.

    Consecutive documents are separated by a rule of 100 dashes. Nothing is
    returned; the output goes to stdout.
    """
    divider = f"\n{'-' * 100}\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(divider.join(sections))
 def generate_rag(
     prompt: str,
+    input_role: str,
     topic: str,
+    context: str,
     model: str,
     url_content: dict,
     path: list[str],
     if llm is None:
         print("Failed to load LLM. Aborting operation.")
         return None
+    query = llm_wrapper(input_role, topic, context, model="OpenAI GPT 4o", task_type="rag", temperature=0.7)
+    print("### Query: ", query)
+    db = create_db_with_langchain(path, url_content, query)
+    retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": K, "fetch_k": FETCH_K, "lambda_mult": 0.75})
+    # docs = retriever.get_relevant_documents(query)
+    compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)
+    docs = compression_retriever.invoke(query)
+    print(pretty_print_docs(docs))
+    formatted_docs = format_docs_xml(docs)
+    rag_chain = RunnablePassthrough.assign(context=lambda _: formatted_docs) | xml_prompt | llm | XMLOutputParser()
+    result = rag_chain.invoke({"input": prompt})
+    citations = get_citations(result, docs)
+    db.delete_collection()  # important, otherwise it will keep the documents in memory
+    torch.cuda.empty_cache()
+    gc.collect()
+    return result, citations
 def generate_base(
     llm = load_llm(model, api_key, temperature, max_length)
     if llm is None:
         print("Failed to load LLM. Aborting operation.")
+        return None, None
     try:
         output = llm.invoke(prompt).content
+        output_dict = {"cited_text": [{"chunk": [{"text": output}, {"citations": None}]}]}
+        return output_dict, None
     except Exception as e:
         print(f"An error occurred while running the model: {e}")
+        return None, None
 def generate(
     prompt: str,
+    input_role: str,
     topic: str,
+    context: str,
     model: str,
     url_content: dict,
     path: list[str],
     sys_message="",
 ):
     if path or url_content:
+        return generate_rag(
+            prompt, input_role, topic, context, model, url_content, path, temperature, max_length, api_key, sys_message
+        )
     else:
         return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
_RAG_QUERY_TEMPLATE = """You are an AI assistant tasked with reformulating user inputs to improve retrieval query in a RAG system.
- Given the original user inputs, construct query to be more specific, detailed, and likely to retrieve relevant information.
- Generate the query as a complete sentence or question, not just as keywords, to ensure the retrieval process can find detailed and contextually relevant information.
- You may enhance the query by adding related and relevant terms, but do not introduce new facts, such as dates, numbers, or assumed information, that were not provided in the input.
**Inputs:**
- **User Role**: {iam}
- **Topic**: {topic}
- **Context**: {context}
**Only return the search query**."""

_INTERNET_QUERY_TEMPLATE = """You are an AI assistant tasked with generating an optimized Google search query to help retrieve relevant websites, news, articles, and other sources of information.
- You may enhance the query by adding related and relevant terms, but do not introduce new facts, such as dates, numbers, or assumed information, that were not provided in the input.
- The query should be **concise** and include important **keywords** while incorporating **short phrases** or context where it improves the search.
- Avoid the use of "site:" operators or narrowing search by specific websites.
**Inputs:**
- **User Role**: {iam}
- **Topic**: {topic}
- **Context**: {context}
**Only return the search query**.
"""

_QUERY_TEMPLATES = {"rag": _RAG_QUERY_TEMPLATE, "internet": _INTERNET_QUERY_TEMPLATE}


def llm_wrapper(
    iam=None,
    topic=None,
    context=None,
    temperature=1.0,
    max_length=512,
    api_key="",
    model="OpenAI GPT 4o Mini",
    task_type="internet",
):
    """Ask an LLM to turn the user's role, topic and context into a search query.

    Args:
        iam: The user's role, inserted into the instruction text.
        topic: The topic, inserted into the instruction text.
        context: Extra context, inserted into the instruction text.
        temperature: Passed through to ``load_llm``.
        max_length: Passed through to ``load_llm``.
        api_key: Passed through to ``load_llm``.
        model: Model name passed through to ``load_llm``.
        task_type: ``"rag"`` for a retrieval query or ``"internet"`` for a
            web-search query; selects which instruction text is used.

    Returns:
        The model's reply with surrounding double quotes, then single quotes,
        stripped.

    Raises:
        ValueError: If ``task_type`` is neither ``"rag"`` nor ``"internet"``.
        RuntimeError: If ``load_llm`` returns ``None``.
    """
    # Validate before loading a model so a bad argument fails immediately.
    if task_type not in ("rag", "internet"):
        raise ValueError("Task type not recognized. Please specify 'rag' or 'internet'.")
    template = _QUERY_TEMPLATES[task_type]

    llm = load_llm(model, api_key, temperature, max_length)
    if llm is None:
        # Other callers in this file treat a None result as "failed to load";
        # fail with a clear message instead of an AttributeError on .invoke.
        raise RuntimeError(f"Failed to load LLM for model {model!r}.")

    # The filled-in instructions are sent as a single human message.
    human_message = HumanMessage(content=template.format(iam=iam, topic=topic, context=context))
    response = llm.invoke([human_message])
    return response.content.strip('"').strip("'")

app.py CHANGED Viewed

@@ -3,41 +3,261 @@ nohup python3 app.py &
 export GOOGLE_APPLICATION_CREDENTIALS="gcp_creds.json"
 """
 import re
 from typing import Dict
 from collections import defaultdict
 from datetime import date, datetime
-import gradio as gr
 import nltk
 import torch
 import numpy as np
-from scipy.special import softmax
 import language_tool_python
-from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
-from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
-from google_search import google_search, months, domain_list, build_date
-from humanize import humanize_text, device
-from ai_generate import generate
-print(f"Using device: {device}")
-models = {
-    "Polygraf AI (Base Model)": AutoModelForSequenceClassification.from_pretrained(
-        "polygraf-ai/bc-roberta-openai-2sent"
-    ).to(device),
-    "Polygraf AI (Advanced Model)": AutoModelForSequenceClassification.from_pretrained(
-        "polygraf-ai/bc_combined_3sent"
-    ).to(device),
-}
-tokenizers = {
-    "Polygraf AI (Base Model)": AutoTokenizer.from_pretrained("polygraf-ai/bc-roberta-openai-2sent"),
-    "Polygraf AI (Advanced Model)": AutoTokenizer.from_pretrained("polygraf-ai/bc_combined_3sent"),
-}
-# grammar correction tool
-tool = language_tool_python.LanguageTool("en-US")
 # Function to move model to the appropriate device
@@ -62,7 +282,8 @@ def clean_text(text: str) -> str:
         cleaned = re.sub(r"\s+", " ", paragraph).strip()
         cleaned = re.sub(r"(?<=\.) ([a-z])", lambda x: x.group(1).upper(), cleaned)
         cleaned_paragraphs.append(cleaned)
-    return "\n".join(cleaned_paragraphs)
 def format_references(text: str) -> str:
@@ -137,6 +358,8 @@ def predict(model, tokenizer, text):
         output = model(**tokens)
         output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
         output_norm = {"HUMAN": output_norm[0], "AI": output_norm[1]}
         return output_norm
@@ -196,13 +419,6 @@ ai_check_options = [
 ]
-MC_TOKEN_SIZE = 256
-TEXT_MC_MODEL_PATH = "polygraf-ai/mc-model"
-MC_LABEL_MAP = ["OpenAI GPT", "Mistral", "CLAUDE", "Gemini", "Grammar Enhancer"]
-text_mc_tokenizer = AutoTokenizer.from_pretrained(TEXT_MC_MODEL_PATH)
-text_mc_model = AutoModelForSequenceClassification.from_pretrained(TEXT_MC_MODEL_PATH).to(device)
 def predict_mc(text):
     with torch.no_grad():
         text_mc_model.eval()
@@ -215,6 +431,8 @@ def predict_mc(text):
         ).to(device)
         output = text_mc_model(**tokens)
         output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
         return output_norm
@@ -244,6 +462,7 @@ def predict_mc_scores(input, bc_score):
 def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
     body, references = split_text_from_refs(text)
     score, text = detection_polygraf(text=body, model=model)
     mc_score = predict_mc_scores(body, score)  # mc score
@@ -251,7 +470,8 @@ def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
     return score, text, mc_score
-def ai_check(text: str, option: str):
     if option.startswith("Polygraf AI"):
         return highlighter_polygraf(text, option)
     else:
@@ -259,35 +479,39 @@ def ai_check(text: str, option: str):
 def generate_prompt(settings: Dict[str, str]) -> str:
     prompt = f"""
-    I am a {settings['role']}
-    Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.
     Context:
     - {settings['context']}
     Style and Tone:
     - Writing style: {settings['writing_style']}
     - Tone: {settings['tone']}
     - Target audience: {settings['user_category']}
     Content:
     - Depth: {settings['depth_of_content']}
     - Structure: {', '.join(settings['structure'])}
     Keywords to incorporate:
     {', '.join(settings['keywords'])}
     Additional requirements:
     - Don't start with "Here is a...", start with the requested text directly
-    - Include {settings['num_examples']} relevant examples or case studies
-    - Incorporate data or statistics from {', '.join(settings['references'])}
     - End with a {settings['conclusion_type']} conclusion
-    - Add a "References" section in the format "References:" on a new line at the end with at least 3 credible detailed sources, formatted as [1], [2], etc. with each source on their own line
-    - Do not repeat sources
     - Do not make any headline, title bold.
-    Ensure proper paragraph breaks for better readability.
-    Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
     """
     return prompt
@@ -299,7 +523,7 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
     Edit the given text based on user comments.
     User Comments:
     - {settings['user_comments']}
     Requirements:
     - Don't start with "Here is a...", start with the requested text directly
     - The original content should not be changed. Make minor modifications based on user comments above.
@@ -307,7 +531,7 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
     - Do not make any headline, title bold.
     Context:
     - {settings['context']}
     Ensure proper paragraph breaks for better readability.
     Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
     """
@@ -361,23 +585,29 @@ def generate_article(
         prompt = generate_prompt(settings)
     print("Generated Prompt...\n", prompt)
-    article = generate(
         prompt=prompt,
         topic=topic,
         model=ai_model,
         url_content=url_content,
         path=pdf_file_input,
         temperature=1,
         max_length=2048,
         api_key=api_key,
         sys_message="",
     )
-    return clean_text(article)
 def get_history(history):
-    return history
 def clear_history():
@@ -386,8 +616,8 @@ def clear_history():
 def humanize(
-    text: str,
     model: str,
     temperature: float = 1.2,
     repetition_penalty: float = 1,
     top_k: int = 50,
@@ -395,21 +625,35 @@ def humanize(
     history=None,
 ) -> str:
     print("Humanizing text...")
-    body, references = split_text_from_refs(text)
-    result = humanize_text(
-        text=body,
         model_name=model,
         temperature=temperature,
         repetition_penalty=repetition_penalty,
         top_k=top_k,
         length_penalty=length_penalty,
     )
-    result = result + references
-    corrected_text = format_and_correct_language_check(result)
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    history.append((f"Humanized Text | {timestamp}\nInput: {model}", corrected_text))
-    return corrected_text, history
 def update_visibility_api(model: str):
@@ -445,11 +689,6 @@ def update_temperature(model_dropdown):
         return gr.update(value=1.0, interactive=True)
-import uuid
-import json
-from datetime import datetime
-from google.cloud import storage
 # Initialize Google Cloud Storage client
 client = storage.Client()
 bucket_name = "ai-source-detection"
@@ -460,7 +699,6 @@ def save_to_cloud_storage(
     article,
     topic,
     input_role,
-    topic_context,
     context,
     keywords,
     article_length,
@@ -493,7 +731,6 @@ def save_to_cloud_storage(
         "metadata": {
             "topic": topic,
             "input_role": input_role,
-            "topic_context": topic_context,
             "context": context,
             "keywords": keywords,
             "article_length": article_length,
@@ -524,6 +761,31 @@ def save_to_cloud_storage(
     return f"Data saved as {file_name} in GCS."
 def generate_and_format(
     input_role,
     topic,
@@ -561,7 +823,9 @@ def generate_and_format(
         date_from = build_date(year_from, month_from, day_from)
         date_to = build_date(year_to, month_to, day_to)
         sorted_date = f"date:r:{date_from}:{date_to}"
-        final_query = topic
         if include_sites:
             site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
             final_query += " " + " OR ".join(site_queries)
@@ -570,10 +834,10 @@ def generate_and_format(
             final_query += " " + " ".join(exclude_queries)
         print(f"Google Search Query: {final_query}")
         url_content = google_search(final_query, sorted_date, domains_to_include)
-    topic_context = topic + ", " + context
-    article = generate_article(
         input_role,
-        topic_context,
         context,
         keywords,
         article_length,
@@ -593,13 +857,14 @@ def generate_and_format(
         generated_article,
         user_comments,
     )
-    if ends_with_references(article) and url_content is not None:
-        for url in url_content.keys():
-            article += f"\n{url}"
-    reference_formatted = format_references(article)
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    history.append((f"Generated Text | {timestamp}\nInput: {topic}", reference_formatted))
     # Save the article and metadata to Cloud Storage
     # We don't save if there is PDF input for privacy reasons
@@ -608,7 +873,6 @@ def generate_and_format(
             article,
             topic,
             input_role,
-            topic_context,
             context,
             keywords,
             article_length,
@@ -628,415 +892,428 @@ def generate_and_format(
             timestamp,
         )
         print(save_message)
-    return reference_formatted, history
-def create_interface():
-    with gr.Blocks(
-        theme=gr.themes.Default(
-            primary_hue=gr.themes.colors.pink, secondary_hue=gr.themes.colors.yellow, neutral_hue=gr.themes.colors.gray
-        ),
-        css="""
-            .input-highlight-pink block_label {background-color: #008080}
-            """,
-    ) as demo:
-        history = gr.State([])
-        today = date.today()
-        # dd/mm/YY
-        d1 = today.strftime("%d/%B/%Y")
-        d1 = d1.split("/")
-        gr.Markdown("# Polygraf AI Content Writer", elem_classes="text-center text-3xl mb-6")
-        with gr.Row():
-            with gr.Column(scale=2):
-                with gr.Group():
-                    gr.Markdown("## Article Configuration", elem_classes="text-xl mb-4")
-                    input_role = gr.Textbox(label="I am a", placeholder="Enter your role", value="Student")
-                    input_topic = gr.Textbox(
-                        label="Topic",
-                        placeholder="Enter the main topic of your article",
-                        elem_classes="input-highlight-pink",
-                    )
-                    input_context = gr.Textbox(
-                        label="Context",
-                        placeholder="Provide some context for your topic",
-                        elem_classes="input-highlight-pink",
                     )
-                    input_keywords = gr.Textbox(
-                        label="Keywords",
-                        placeholder="Enter comma-separated keywords",
                         elem_classes="input-highlight-yellow",
                     )
                     with gr.Row():
-                        input_format = gr.Dropdown(
-                            choices=[
-                                "Article",
-                                "Essay",
-                                "Blog post",
-                                "Report",
-                                "Research paper",
-                                "News article",
-                                "White paper",
-                                "Email",
-                                "LinkedIn post",
-                                "X (Twitter) post",
-                                "Instagram Video Content",
-                                "TikTok Video Content",
-                                "Facebook post",
-                            ],
-                            value="Article",
-                            label="Format",
-                            elem_classes="input-highlight-turquoise",
                         )
-                    input_length = gr.Slider(
-                        minimum=50,
-                        maximum=5000,
-                        step=50,
-                        value=300,
-                        label="Article Length",
-                        elem_classes="input-highlight-pink",
-                    )
                     with gr.Row():
-                        input_writing_style = gr.Dropdown(
-                            choices=[
-                                "Formal",
-                                "Informal",
-                                "Technical",
-                                "Conversational",
-                                "Journalistic",
-                                "Academic",
-                                "Creative",
-                            ],
-                            value="Formal",
-                            label="Writing Style",
                             elem_classes="input-highlight-yellow",
                         )
-                        input_tone = gr.Dropdown(
-                            choices=["Friendly", "Professional", "Neutral", "Enthusiastic", "Skeptical", "Humorous"],
-                            value="Professional",
-                            label="Tone",
-                            elem_classes="input-highlight-turquoise",
                         )
-                    input_user_category = gr.Dropdown(
-                        choices=[
-                            "Students",
-                            "Professionals",
-                            "Researchers",
-                            "General Public",
-                            "Policymakers",
-                            "Entrepreneurs",
-                        ],
-                        value="General Public",
-                        label="Target Audience",
-                        elem_classes="input-highlight-pink",
-                    )
-                    input_depth = gr.Dropdown(
-                        choices=[
-                            "Surface-level overview",
-                            "Moderate analysis",
-                            "In-depth research",
-                            "Comprehensive study",
-                        ],
-                        value="Moderate analysis",
-                        label="Depth of Content",
-                        elem_classes="input-highlight-yellow",
-                    )
-                    input_structure = gr.Dropdown(
-                        choices=[
-                            "Introduction, Body, Conclusion",
-                            "Abstract, Introduction, Methods, Results, Discussion, Conclusion",
-                            "Executive Summary, Problem Statement, Analysis, Recommendations, Conclusion",
-                            "Introduction, Literature Review, Methodology, Findings, Analysis, Conclusion",
-                            "Plain Text",
-                        ],
-                        value="Introduction, Body, Conclusion",
-                        label="Structure",
-                        elem_classes="input-highlight-turquoise",
-                        interactive=True,
-                    )
-                    input_references = gr.Dropdown(
-                        choices=[
-                            "Academic journals",
-                            "Industry reports",
-                            "Government publications",
-                            "News outlets",
-                            "Expert interviews",
-                            "Case studies",
-                        ],
-                        value="News outlets",
-                        label="References",
-                        elem_classes="input-highlight-pink",
-                    )
-                    input_num_examples = gr.Dropdown(
-                        choices=["1-2", "3-4", "5+"],
-                        value="1-2",
-                        label="Number of Examples/Case Studies",
-                        elem_classes="input-highlight-yellow",
-                    )
-                    input_conclusion = gr.Dropdown(
-                        choices=["Summary", "Call to Action", "Future Outlook", "Thought-provoking Question"],
-                        value="Call to Action",
-                        label="Conclusion Type",
-                        elem_classes="input-highlight-turquoise",
                     )
-                    gr.Markdown("# Search Options", elem_classes="text-center text-3xl mb-6")
-                    google_default = False
                     with gr.Row():
-                        google_search_check = gr.Checkbox(
-                            label="Enable Internet Search For Recent Sources", value=google_default
                         )
-                    with gr.Group(visible=google_default) as search_options:
-                        with gr.Row():
-                            include_sites = gr.Textbox(
-                                label="Include Specific Websites",
-                                placeholder="Enter comma-separated keywords",
-                                elem_classes="input-highlight-yellow",
-                            )
-                        with gr.Row():
-                            exclude_sites = gr.Textbox(
-                                label="Exclude Specific Websites",
-                                placeholder="Enter comma-separated keywords",
-                                elem_classes="input-highlight-yellow",
-                            )
-                        with gr.Row():
-                            domains_to_include = gr.Dropdown(
-                                domain_list,
-                                value=domain_list,
-                                multiselect=True,
-                                label="Domains To Include",
-                            )
-                        with gr.Row():
-                            month_from = gr.Dropdown(
-                                choices=months,
-                                label="From Month",
-                                value="January",
-                                interactive=True,
-                            )
-                            day_from = gr.Textbox(label="From Day", value="01")
-                            year_from = gr.Textbox(label="From Year", value="2000")
-                        with gr.Row():
-                            month_to = gr.Dropdown(
-                                choices=months,
-                                label="To Month",
-                                value=d1[1],
-                                interactive=True,
-                            )
-                            day_to = gr.Textbox(label="To Day", value=d1[0])
-                            year_to = gr.Textbox(label="To Year", value=d1[2])
-                    gr.Markdown("# Add Optional PDF Files with Information", elem_classes="text-center text-3xl mb-6")
-                    pdf_file_input = gr.File(label="Upload PDF(s)", file_count="multiple", file_types=[".pdf"])
                 """
-                # NOTE: HIDE AI MODEL SELECTION
-                with gr.Group():
-                    gr.Markdown("## AI Model Configuration", elem_classes="text-xl mb-4")
-                    ai_generator = gr.Dropdown(
-                        choices=[
-                            "OpenAI GPT 4",
-                            "OpenAI GPT 4o",
-                            "OpenAI GPT 4o Mini",
-                            "Claude Sonnet 3.5",
-                            "Gemini 1.5 Pro",
-                            "LLaMA 3",
-                        ],
-                        value="OpenAI GPT 4o Mini",
-                        label="AI Model",
-                        elem_classes="input-highlight-pink",
-                    )
-                input_api = gr.Textbox(label="API Key", visible=False)
-                ai_generator.change(update_visibility_api, ai_generator, input_api)
                 """
-                generate_btn = gr.Button("Generate Article", variant="primary")
-            with gr.Column(scale=3):
-                with gr.Tab("Text Generator"):
-                    output_article = gr.Textbox(label="Generated Article", lines=20)
-                    ai_comments = gr.Textbox(
-                        label="Add comments to help edit generated text", interactive=True, visible=False
-                    )
-                    regenerate_btn = gr.Button("Regenerate Article", variant="primary", visible=False)
-                    ai_detector_dropdown = gr.Radio(
-                        choices=ai_check_options, label="Select AI Detector", value="Polygraf AI"
-                    )
-                    ai_check_btn = gr.Button("AI Check")
-                    with gr.Accordion("AI Detection Results", open=True):
-                        ai_check_result = gr.Label(label="AI Check Result")
-                        mc_check_result = gr.Label(label="Creator Check Result")
-                        highlighted_text = gr.HTML(label="Sentence Breakdown", visible=False)
-                    with gr.Accordion("Advanced Humanizer Settings", open=False):
-                        with gr.Row():
-                            model_dropdown = gr.Radio(
-                                choices=["Standard Model", "Advanced Model (Beta)"],
-                                value="Advanced Model (Beta)",
-                                label="Humanizer Model Version",
-                            )
-                        with gr.Row():
-                            temperature_slider = gr.Slider(
-                                minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Temperature"
-                            )
-                            top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=40, label="Top k")
-                        with gr.Row():
-                            repetition_penalty_slider = gr.Slider(
-                                minimum=1.0, maximum=2.0, step=0.1, value=1, label="Repetition Penalty"
-                            )
-                            length_penalty_slider = gr.Slider(
-                                minimum=0.0, maximum=2.0, step=0.1, value=1.0, label="Length Penalty"
-                            )
-                    humanize_btn = gr.Button("Humanize")
-                    # humanized_output = gr.Markdown(label="Humanized Article", value="\n\n\n\n", render=True)
-                    # copy_to_input_btn = gr.Button("Copy to Input for AI Check")
-                with gr.Tab("History"):
-                    history_chat = gr.Chatbot(label="Generation History", height=1000)
-                    clear_history_btn = gr.Button("Clear History")
-                    clear_history_btn.click(clear_history, outputs=[history, history_chat])
-                    """
-                    # NOTE: REMOVED REFRESH BUTTON
-                    refresh_button = gr.Button("Refresh History")
-                    refresh_button.click(get_history, outputs=history_chat)
-                    """
-        def regenerate_visible(text):
-            if text:
-                return gr.update(visible=True)
-            else:
-                return gr.update(visible=False)
-        def highlight_visible(text):
-            if text.startswith("Polygraf"):
-                return gr.update(visible=True)
-            else:
-                return gr.update(visible=False)
-        def search_visible(toggle):
-            if toggle:
-                return gr.update(visible=True)
-            else:
-                return gr.update(visible=False)
-        google_search_check.change(search_visible, inputs=google_search_check, outputs=search_options)
-        ai_detector_dropdown.change(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
-        output_article.change(regenerate_visible, inputs=output_article, outputs=ai_comments)
-        ai_comments.change(regenerate_visible, inputs=output_article, outputs=regenerate_btn)
-        ai_check_btn.click(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
-        # Update the default structure based on the selected format
-        # e.g. "Plain Text" for certain formats
-        input_format.change(fn=update_structure, inputs=input_format, outputs=input_structure)
-        model_dropdown.change(fn=update_temperature, inputs=model_dropdown, outputs=temperature_slider)
-        generate_btn.click(
-            fn=generate_and_format,
-            inputs=[
-                input_role,
-                input_topic,
-                input_context,
-                input_keywords,
-                input_length,
-                input_format,
-                input_writing_style,
-                input_tone,
-                input_user_category,
-                input_depth,
-                input_structure,
-                input_references,
-                input_num_examples,
-                input_conclusion,
-                # ai_generator,
-                # input_api,
-                google_search_check,
-                year_from,
-                month_from,
-                day_from,
-                year_to,
-                month_to,
-                day_to,
-                domains_to_include,
-                include_sites,
-                exclude_sites,
-                pdf_file_input,
-                history,
-            ],
-            outputs=[output_article, history],
-        )
-        regenerate_btn.click(
-            fn=generate_and_format,
-            inputs=[
-                input_role,
-                input_topic,
-                input_context,
-                input_keywords,
-                input_length,
-                input_format,
-                input_writing_style,
-                input_tone,
-                input_user_category,
-                input_depth,
-                input_structure,
-                input_references,
-                input_num_examples,
-                input_conclusion,
-                # ai_generator,
-                # input_api,
-                google_search_check,
-                year_from,
-                month_from,
-                day_from,
-                year_to,
-                month_to,
-                day_to,
-                domains_to_include,
-                pdf_file_input,
-                history,
-                output_article,
-                include_sites,
-                exclude_sites,
-                ai_comments,
-            ],
-            outputs=[output_article, history],
-        )
-        ai_check_btn.click(
-            fn=ai_check,
-            inputs=[output_article, ai_detector_dropdown],
-            outputs=[ai_check_result, highlighted_text, mc_check_result],
-        )
-        humanize_btn.click(
-            fn=humanize,
-            inputs=[
-                output_article,
-                model_dropdown,
-                temperature_slider,
-                repetition_penalty_slider,
-                top_k_slider,
-                length_penalty_slider,
-                history,
-            ],
-            outputs=[output_article, history],
-        )
-        generate_btn.click(get_history, inputs=[history], outputs=[history_chat])
-        regenerate_btn.click(get_history, inputs=[history], outputs=[history_chat])
-        humanize_btn.click(get_history, inputs=[history], outputs=[history_chat])
-    return demo
 if __name__ == "__main__":
-    demo = create_interface()
-    demo.queue(
-        max_size=2,
-        default_concurrency_limit=2,
-    ).launch(server_name="0.0.0.0", share=True, server_port=7890)
-    # demo.launch(server_name="0.0.0.0")

 export GOOGLE_APPLICATION_CREDENTIALS="gcp_creds.json"
 """
+import gc
 import re
+import uuid
+import json
 from typing import Dict
 from collections import defaultdict
 from datetime import date, datetime
 import nltk
 import torch
 import numpy as np
+import gradio as gr
 import language_tool_python
+from scipy.special import softmax
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from google.cloud import storage
+if gr.NO_RELOAD:
+    from humanize import humanize_text, device
+    from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
+    from google_search import google_search, months, domain_list, build_date
+    from ai_generate import generate, citations_to_html, remove_citations, display_cited_text, llm_wrapper
+    nltk.download("punkt_tab")
+    print(f"Using device: {device}")
+    print("Loading AI detection models...")
+    models = {
+        "Polygraf AI (Base Model)": AutoModelForSequenceClassification.from_pretrained(
+            "polygraf-ai/bc-roberta-openai-2sent"
+        ).to(device),
+        "Polygraf AI (Advanced Model)": AutoModelForSequenceClassification.from_pretrained(
+            "polygraf-ai/bc_combined_3sent"
+        ).to(device),
+    }
+    tokenizers = {
+        "Polygraf AI (Base Model)": AutoTokenizer.from_pretrained("polygraf-ai/bc-roberta-openai-2sent"),
+        "Polygraf AI (Advanced Model)": AutoTokenizer.from_pretrained("polygraf-ai/bc_combined_3sent"),
+    }
+    # grammar correction tool
+    tool = language_tool_python.LanguageTool("en-US")
+    # source detection model
+    MC_TOKEN_SIZE = 256
+    TEXT_MC_MODEL_PATH = "polygraf-ai/mc-model"
+    MC_LABEL_MAP = ["OpenAI GPT", "Mistral", "CLAUDE", "Gemini", "Grammar Enhancer"]
+    text_mc_tokenizer = AutoTokenizer.from_pretrained(TEXT_MC_MODEL_PATH)
+    print("Loading Source detection model...")
+    text_mc_model = AutoModelForSequenceClassification.from_pretrained(TEXT_MC_MODEL_PATH).to(device)
def generate_cited_html(cited_text, citations: dict):
    """Render cited article text as an HTML fragment with reference popups.

    Every inline marker of the form ``<N>`` in ``cited_text`` whose integer
    ``N`` is a key of ``citations`` is replaced by a small numbered button
    (labelled ``N + 1``) that toggles a popup showing the reference's source
    and content. Markers without a matching entry are left as-is.

    Args:
        cited_text: Article text containing ``<N>`` citation markers.
            Newlines are converted to ``<br>``.
        citations: Mapping of integer citation id to a dict with the string
            keys ``"source"`` and ``"content"``. It is only read; the entries
            are not modified.

    Returns:
        A string holding the ``<style>``/``<script>`` preamble followed by the
        article wrapped in a scrollable ``<div>``.
    """
    # Local import keeps this block self-contained; `html_code` below is a
    # plain local string and does not clash with the stdlib module.
    from html import escape

    # NOTE(review): the article body itself is inserted unescaped because the
    # ``<N>`` markers are matched on the raw text; confirm upstream output
    # cannot contain markup.
    cited_text = cited_text.replace("\n", "<br>")
    html_code = """
    <style>
    .reference-container {
        position: relative;
        display: inline-block;
    }
    .reference-btn {
        display: inline-block;
        width: 20px; /* Reduced width */
        height: 20px; /* Reduced height */
        border-radius: 50%;
        background-color: #e33a89; /* Pink color for the button */
        color: white;
        text-align: center;
        line-height: 20px; /* Adjusted line-height */
        cursor: pointer;
        font-weight: bold;
        margin-right: 5px;
        transition: background-color 0.3s ease, transform 0.3s ease;
    }
    .reference-btn:hover {
        background-color: #ff69b4; /* Lighter pink on hover */
        transform: scale(1.1); /* Slightly enlarge on hover */
    }
    .reference-popup {
        display: none;
        position: absolute;
        z-index: 1;
        top: 100%;
        background-color: #f9f9f9;
        border: 1px solid #ddd;
        padding: 15px;
        border-radius: 4px;
        box-shadow: 0 2px 5px rgba(0,0,0,0.2);
        width: calc(min(90vw, 400px));
        max-height: calc(min(80vh, 300px));
        overflow-y: auto;
    }
    .reference-popup .close-btn {
        float: right;
        cursor: pointer;
        font-weight: bold;
        color: white;
        font-size: 16px;
        padding: 0;
        width: 20px;
        height: 20px;
        text-align: center;
        line-height: 20px;
        background-color: #ff4c4c;
        border-radius: 2px;
        transition: transform 0.3s ease, background-color 0.3s ease;
    }
    .reference-popup .close-btn:hover {
        transform: scale(1.2);
        background-color: #ff3333;
    }
    input[type="radio"] {
        position: absolute;
        opacity: 0;
        pointer-events: none;
    }
    input[type="radio"]:checked + .reference-popup {
        display: block;
    }
    /* Additional styling for distinct sections */
    .reference-popup strong {
        font-weight: bold;
        color: #333;
        display: block;
        margin-bottom: 5px;
    }
    .reference-popup p {
        margin: 0 0 10px 0;
        padding: 0;
    }
    .reference-popup .source {
        margin-bottom: 10px;
        font-size: 14px;
        font-weight: bold;
        color: #1e90ff;
    }
    .reference-popup .content {
        margin-bottom: 10px;
        font-size: 13px;
        color: #555;
    }
    @media (prefers-color-scheme: dark) {
        .reference-btn {
            background-color: #1e90ff;
        }
        .reference-popup {
            background-color: #2c2c2c;
            border-color: #444;
            color: #f1f1f1;
        }
        .reference-popup .close-btn {
            background-color: #ff4c4c;
        }
        .reference-popup .close-btn:hover {
            background-color: #ff3333;
        }
        .reference-popup strong {
            color: #ddd;
        }
        .reference-popup .source {
            color: #1e90ff;
        }
        .reference-popup .content {
            color: #bbb;
        }
    }
    </style>
    <script>
    document.addEventListener('click', (event) => {
        const containers = document.querySelectorAll('.reference-container');
        containers.forEach(container => {
            const rect = container.getBoundingClientRect();
            const popup = container.querySelector('.reference-popup');
            // Reset alignment (including the centering transform, so a popup
            // that was centered earlier is not left shifted when edge-aligned)
            popup.style.left = '';
            popup.style.right = '';
            popup.style.transform = '';
            const popupWidth = popup.offsetWidth;
            const viewportWidth = window.innerWidth;
            // If the popup would go off the right edge
            if (rect.right + popupWidth > viewportWidth) {
                popup.style.right = '0';  // Align popup to the right
            }
            // If the popup would go off the left edge
            else if (rect.left - popupWidth < 0) {
                popup.style.left = '0';  // Align popup to the left
            }
            // Otherwise center it
            else {
                popup.style.left = '50%';
                popup.style.transform = 'translateX(-50%)'; // Center the popup
            }
        });
    });
    function closeReferencePanes() {
        document.querySelectorAll('input[name="reference"]').forEach((input) => {
            input.checked = false;
        });
    }
    </script>
    <div style="height: 600px; overflow-y: auto; overflow-x: auto;">
    """

    # Counts rendered citations so repeated uses of the same reference still
    # get unique element ids.
    citation_count = 0

    def replace_citations(match):
        """Return the button/popup HTML for one ``<N>`` marker."""
        nonlocal citation_count
        citation_id = match.group(1)  # citation number captured by the regex
        ref_data = citations.get(int(citation_id))
        # Unknown reference: keep the original marker text untouched.
        if not ref_data:
            return match.group(0)

        # Work on local copies so the caller's citation entries stay intact.
        source = ref_data["source"]
        # Uploaded PDFs live under gradio's temp dir; show only the file name.
        if "/var/tmp/gradio/" in source:
            source = source.split("/")[-1]
        # Remove newline artifacts left over from scraping / parsing.
        content = ref_data["content"].replace("\n", " ")

        # Source and content originate from scraped pages / uploaded files,
        # so escape them before they are embedded in markup.
        safe_source = escape(source, quote=True)
        safe_content = escape(content)

        # URLs become clickable links; anything else is shown as plain text.
        if source.startswith("http"):
            source_html = f'<a href="{safe_source}" target="_blank" class="source">{safe_source}</a>'
        else:
            source_html = f'<span class="source">{safe_source}</span>'

        # Unique id for each reference button and popup.
        unique_id = f"{citation_id}-{citation_count}"
        citation_count += 1

        # The hidden radio input drives popup visibility through the
        # ``:checked + .reference-popup`` CSS rule above.
        button_html = f"""
        <span class="reference-container">
        <label for="ref-toggle-{unique_id}" class="reference-btn" onclick="closeReferencePanes(); document.getElementById('ref-toggle-{unique_id}').checked = true;">{int(citation_id)+1}</label>
        <input type="radio" id="ref-toggle-{unique_id}" name="reference" />
        <span class="reference-popup">
            <span class="close-btn" onclick="document.getElementById('ref-toggle-{unique_id}').checked = false;">&times;</span>
            <strong>Source:</strong> {source_html}
            <strong>Content:</strong> <p class="content">{safe_content}</p>
        </span>
        </span>
        """
        return button_html

    # Replace inline citations in the text with the generated HTML.
    html_code += re.sub(r"<(\d+)>", replace_citations, cited_text)
    html_code += "</div>"
    return html_code
 # Function to move model to the appropriate device
         cleaned = re.sub(r"\s+", " ", paragraph).strip()
         cleaned = re.sub(r"(?<=\.) ([a-z])", lambda x: x.group(1).upper(), cleaned)
         cleaned_paragraphs.append(cleaned)
+    cleaned_paragraphs = [item for item in cleaned_paragraphs if item.strip()]
+    return "\n\n".join(cleaned_paragraphs)
 def format_references(text: str) -> str:
         output = model(**tokens)
         output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
         output_norm = {"HUMAN": output_norm[0], "AI": output_norm[1]}
+        torch.cuda.empty_cache()
+        gc.collect()
         return output_norm
 ]
 def predict_mc(text):
     with torch.no_grad():
         text_mc_model.eval()
         ).to(device)
         output = text_mc_model(**tokens)
         output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
+        torch.cuda.empty_cache()
+        gc.collect()
         return output_norm
 def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
+    text = remove_citations(text)
     body, references = split_text_from_refs(text)
     score, text = detection_polygraf(text=body, model=model)
     mc_score = predict_mc_scores(body, score)  # mc score
     return score, text, mc_score
+def ai_check(history: list, option: str):
+    text = history[-1][1]
     if option.startswith("Polygraf AI"):
         return highlighter_polygraf(text, option)
     else:
 def generate_prompt(settings: Dict[str, str]) -> str:
+    settings["keywords"] = [item for item in settings["keywords"] if item.strip()]
+    #    - Add a "References" section in the format "References:" on a new line after the requested text, formatted as [1], [2], etc. with each source on their own line
     prompt = f"""
+Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.\n
+    """
+    if settings["context"]:
+        prompt += f"""
     Context:
     - {settings['context']}
+        """
+    prompt += f"""
     Style and Tone:
     - Writing style: {settings['writing_style']}
     - Tone: {settings['tone']}
     - Target audience: {settings['user_category']}
     Content:
     - Depth: {settings['depth_of_content']}
     - Structure: {', '.join(settings['structure'])}
+    """
+    if len(settings["keywords"]) > 0:
+        prompt += f"""
     Keywords to incorporate:
     {', '.join(settings['keywords'])}
+        """
+    prompt += f"""
     Additional requirements:
     - Don't start with "Here is a...", start with the requested text directly
     - End with a {settings['conclusion_type']} conclusion
     - Do not make any headline, title bold.
+    - Ensure proper paragraph breaks for better readability.
+    - Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
+    - Adhere to any format structure provided to the system if any.
     """
     return prompt
     Edit the given text based on user comments.
     User Comments:
     - {settings['user_comments']}
     Requirements:
     - Don't start with "Here is a...", start with the requested text directly
     - The original content should not be changed. Make minor modifications based on user comments above.
     - Do not make any headline, title bold.
     Context:
     - {settings['context']}
     Ensure proper paragraph breaks for better readability.
     Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
     """
         prompt = generate_prompt(settings)
     print("Generated Prompt...\n", prompt)
+    article, citations = generate(
         prompt=prompt,
+        input_role=input_role,
         topic=topic,
+        context=context,
         model=ai_model,
         url_content=url_content,
         path=pdf_file_input,
+        # path=["./final_report.pdf"], # TODO: reset
         temperature=1,
         max_length=2048,
         api_key=api_key,
         sys_message="",
     )
+    return article, citations
 def get_history(history):
+    # return history
+    history_formatted = []
+    for entry in history:
+        history_formatted.append((entry[0], entry[1]))
+    return history_formatted
 def clear_history():
 def humanize(
     model: str,
+    cited_text: str,
     temperature: float = 1.2,
     repetition_penalty: float = 1,
     top_k: int = 50,
     history=None,
 ) -> str:
     print("Humanizing text...")
+    # body, references = split_text_from_refs(text)
+    cited_text = history[-1][1]
+    citations = history[-1][2]
+    article = humanize_text(
+        text=cited_text,
         model_name=model,
         temperature=temperature,
         repetition_penalty=repetition_penalty,
         top_k=top_k,
         length_penalty=length_penalty,
     )
+    # result = result + references
+    # corrected_text = format_and_correct_language_check(result)
+    article = clean_text(article)
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    history.append((f"Humanized Text | {timestamp}\nInput: {model}", article, citations))
+    latest_humanizer_data = {
+        "original text": cited_text,
+        "humanized text": article,
+        "citations": citations,  # can remove saving citations
+        "metadata": {
+            "temperature": temperature,
+            "repetition_penalty": repetition_penalty,
+            "top_k": top_k,
+            "length_penalty": length_penalty,
+        },
+        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+    }
+    return generate_cited_html(article, citations), history, latest_humanizer_data
 def update_visibility_api(model: str):
         return gr.update(value=1.0, interactive=True)
 # Initialize Google Cloud Storage client
 client = storage.Client()
 bucket_name = "ai-source-detection"
     article,
     topic,
     input_role,
     context,
     keywords,
     article_length,
         "metadata": {
             "topic": topic,
             "input_role": input_role,
             "context": context,
             "keywords": keywords,
             "article_length": article_length,
     return f"Data saved as {file_name} in GCS."
def save_humanizer_feedback_to_cloud_storage(data, humanizer_feedback):
    """Upload the latest humanizer result plus the user's feedback to Google Cloud Storage.

    Args:
        data: Record of the most recent humanizer run; it must contain a
            "timestamp" string. Falsy when nothing has been humanized yet.
        humanizer_feedback: Free-text feedback entered by the user.

    Returns:
        None. The outcome is reported to the user through Gradio toasts.
    """
    if not data:
        gr.Warning("Nothing humanized to save yet!")
        return
    try:
        # Work on a copy so the feedback is not written back into the shared state object.
        payload = {**data, "user_feedback": humanizer_feedback}
        # Unique object name: path-safe timestamp plus a random UUID.
        safe_timestamp = payload["timestamp"].replace(" ", "_").replace(":", "-")
        file_name = f"ai-writer/humanizer-feedback/{safe_timestamp}_{uuid.uuid4()}.json"
        blob = bucket.blob(file_name)
        blob.upload_from_string(json.dumps(payload), content_type="application/json")
    except Exception as err:
        # Best-effort reporting: keep the UI alive, but leave a trace of the failure in the logs.
        print(f"Failed to save humanizer feedback: {err!r}")
        gr.Warning("Report not saved.")
    else:
        gr.Info("Successfully reported. Thank you for the feedback!")
 def generate_and_format(
     input_role,
     topic,
         date_from = build_date(year_from, month_from, day_from)
         date_to = build_date(year_to, month_to, day_to)
         sorted_date = f"date:r:{date_from}:{date_to}"
+        final_query = llm_wrapper(
+            input_role, topic, context, model="OpenAI GPT 4o", task_type="internet", temperature=0.7
+        )
         if include_sites:
             site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
             final_query += " " + " OR ".join(site_queries)
             final_query += " " + " ".join(exclude_queries)
         print(f"Google Search Query: {final_query}")
         url_content = google_search(final_query, sorted_date, domains_to_include)
+    # topic_context = topic + ", " + context
+    article, citations = generate_article(
         input_role,
+        topic,
         context,
         keywords,
         article_length,
         generated_article,
         user_comments,
     )
+    # if ends_with_references(article) and url_content is not None:
+    #     for url in url_content.keys():
+    #         article += f"\n{url}"
+    article = clean_text(display_cited_text(article))
+    # reference_formatted = format_references(article)
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    history.append((f"Generated Text | {timestamp}\nInput: {topic}", article, citations))
     # Save the article and metadata to Cloud Storage
     # We dont save if there is PDF input for privacy reasons
             article,
             topic,
             input_role,
             context,
             keywords,
             article_length,
             timestamp,
         )
         print(save_message)
+    return generate_cited_html(article, citations), history
+# def create_interface():
+with gr.Blocks(
+    theme=gr.themes.Default(
+        primary_hue=gr.themes.colors.pink, secondary_hue=gr.themes.colors.yellow, neutral_hue=gr.themes.colors.gray
+    ),
+    css="""
+        .input-highlight-pink block_label {background-color: #008080}
+        """,
+) as demo:
+    history = gr.State([])
+    latest_humanizer_data = gr.State()
+    today = date.today()
+    # dd/mm/YY
+    d1 = today.strftime("%d/%B/%Y")
+    d1 = d1.split("/")
+    gr.Markdown("# Polygraf AI Content Writer", elem_classes="text-center text-3xl mb-6")
+    with gr.Row():
+        with gr.Column(scale=1):
+            with gr.Group():
+                gr.Markdown("## Article Configuration", elem_classes="text-xl mb-4")
+                input_role = gr.Textbox(label="I am a", placeholder="Enter your role", value="Student")
+                input_topic = gr.Textbox(
+                    label="Topic",
+                    placeholder="Enter the main topic of your article",
+                    elem_classes="input-highlight-pink",
+                )
+                input_context = gr.Textbox(
+                    label="Context",
+                    placeholder="Provide some context for your topic",
+                    elem_classes="input-highlight-pink",
+                )
+                input_keywords = gr.Textbox(
+                    label="Keywords",
+                    placeholder="Enter comma-separated keywords",
+                    elem_classes="input-highlight-yellow",
+                )
+                with gr.Row():
+                    input_format = gr.Dropdown(
+                        choices=[
+                            "Article",
+                            "Essay",
+                            "Blog post",
+                            "Report",
+                            "Research paper",
+                            "News article",
+                            "White paper",
+                            "Email",
+                            "LinkedIn post",
+                            "X (Twitter) post",
+                            "Instagram Video Content",
+                            "TikTok Video Content",
+                            "Facebook post",
+                        ],
+                        value="Article",
+                        label="Format",
+                        elem_classes="input-highlight-turquoise",
                     )
+                input_length = gr.Slider(
+                    minimum=50,
+                    maximum=5000,
+                    step=50,
+                    value=300,
+                    label="Article Length",
+                    elem_classes="input-highlight-pink",
+                )
+                with gr.Row():
+                    input_writing_style = gr.Dropdown(
+                        choices=[
+                            "Formal",
+                            "Informal",
+                            "Technical",
+                            "Conversational",
+                            "Journalistic",
+                            "Academic",
+                            "Creative",
+                        ],
+                        value="Formal",
+                        label="Writing Style",
                         elem_classes="input-highlight-yellow",
                     )
+                    input_tone = gr.Dropdown(
+                        choices=["Friendly", "Professional", "Neutral", "Enthusiastic", "Skeptical", "Humorous"],
+                        value="Professional",
+                        label="Tone",
+                        elem_classes="input-highlight-turquoise",
+                    )
+                input_user_category = gr.Dropdown(
+                    choices=[
+                        "Students",
+                        "Professionals",
+                        "Researchers",
+                        "General Public",
+                        "Policymakers",
+                        "Entrepreneurs",
+                    ],
+                    value="General Public",
+                    label="Target Audience",
+                    elem_classes="input-highlight-pink",
+                )
+                input_depth = gr.Dropdown(
+                    choices=[
+                        "Surface-level overview",
+                        "Moderate analysis",
+                        "In-depth research",
+                        "Comprehensive study",
+                    ],
+                    value="Moderate analysis",
+                    label="Depth of Content",
+                    elem_classes="input-highlight-yellow",
+                )
+                input_structure = gr.Dropdown(
+                    choices=[
+                        "Introduction, Body, Conclusion",
+                        "Abstract, Introduction, Methods, Results, Discussion, Conclusion",
+                        "Executive Summary, Problem Statement, Analysis, Recommendations, Conclusion",
+                        "Introduction, Literature Review, Methodology, Findings, Analysis, Conclusion",
+                        "Plain Text",
+                    ],
+                    value="Introduction, Body, Conclusion",
+                    label="Structure",
+                    elem_classes="input-highlight-turquoise",
+                    interactive=True,
+                )
+                input_references = gr.Dropdown(
+                    choices=[
+                        "Academic journals",
+                        "Industry reports",
+                        "Government publications",
+                        "News outlets",
+                        "Expert interviews",
+                        "Case studies",
+                    ],
+                    value="News outlets",
+                    label="References",
+                    elem_classes="input-highlight-pink",
+                )
+                input_num_examples = gr.Dropdown(
+                    choices=["1-2", "3-4", "5+"],
+                    value="1-2",
+                    label="Number of Examples/Case Studies",
+                    elem_classes="input-highlight-yellow",
+                )
+                input_conclusion = gr.Dropdown(
+                    choices=["Summary", "Call to Action", "Future Outlook", "Thought-provoking Question"],
+                    value="Call to Action",
+                    label="Conclusion Type",
+                    elem_classes="input-highlight-turquoise",
+                )
+                gr.Markdown("# Search Options", elem_classes="text-center text-3xl mb-6")
+                google_default = False
+                with gr.Row():
+                    google_search_check = gr.Checkbox(
+                        label="Enable Internet Search For Recent Sources", value=google_default
+                    )
+                with gr.Group(visible=google_default) as search_options:
                     with gr.Row():
+                        include_sites = gr.Textbox(
+                            label="Include Specific Websites",
+                            placeholder="Enter comma-separated keywords",
+                            elem_classes="input-highlight-yellow",
                         )
                     with gr.Row():
+                        exclude_sites = gr.Textbox(
+                            label="Exclude Specific Websites",
+                            placeholder="Enter comma-separated keywords",
                             elem_classes="input-highlight-yellow",
                         )
+                    with gr.Row():
+                        domains_to_include = gr.Dropdown(
+                            domain_list,
+                            value=domain_list,
+                            multiselect=True,
+                            label="Domains To Include",
+                        )
+                    with gr.Row():
+                        month_from = gr.Dropdown(
+                            choices=months,
+                            label="From Month",
+                            value="January",
+                            interactive=True,
                         )
+                        day_from = gr.Textbox(label="From Day", value="01")
+                        year_from = gr.Textbox(label="From Year", value="2000")
+                    with gr.Row():
+                        month_to = gr.Dropdown(
+                            choices=months,
+                            label="To Month",
+                            value=d1[1],
+                            interactive=True,
+                        )
+                        day_to = gr.Textbox(label="To Day", value=d1[0])
+                        year_to = gr.Textbox(label="To Year", value=d1[2])
+                gr.Markdown("# Add Optional PDF Files with Information", elem_classes="text-center text-3xl mb-6")
+                pdf_file_input = gr.File(label="Upload PDF(s)", file_count="multiple", file_types=[".pdf"])
+            """
+            # NOTE: HIDE AI MODEL SELECTION
+            with gr.Group():
+                gr.Markdown("## AI Model Configuration", elem_classes="text-xl mb-4")
+                ai_generator = gr.Dropdown(
+                    choices=[
+                        "OpenAI GPT 4",
+                        "OpenAI GPT 4o",
+                        "OpenAI GPT 4o Mini",
+                        "Claude Sonnet 3.5",
+                        "Gemini 1.5 Pro",
+                        "LLaMA 3",
+                    ],
+                    value="OpenAI GPT 4o Mini",
+                    label="AI Model",
+                    elem_classes="input-highlight-pink",
+                )
+            input_api = gr.Textbox(label="API Key", visible=False)
+            ai_generator.change(update_visibility_api, ai_generator, input_api)
+            """
+            generate_btn = gr.Button("Generate Article", variant="primary")
+        with gr.Column(scale=2):
+            with gr.Tab("Text Generator"):
+                output_article = gr.HTML(
+                    value="""<div style="height: 600px;"></div>""",
+                    label="Generated Article",
+                )
+                with gr.Accordion("Regenerate Article", open=False):
+                    ai_comments = gr.Textbox(
+                        label="Add comments to help edit generated text", interactive=True, visible=True
                     )
+                    regenerate_btn = gr.Button("Regenerate Article", variant="primary", visible=True)
+                ai_detector_dropdown = gr.Dropdown(
+                    choices=ai_check_options, label="Select AI Detector", value="Polygraf AI (Base Model)"
+                )
+                ai_check_btn = gr.Button("AI Check")
+                with gr.Accordion("AI Detection Results", open=True):
+                    ai_check_result = gr.Label(label="AI Check Result")
+                    mc_check_result = gr.Label(label="Creator Check Result")
+                    highlighted_text = gr.HTML(label="Sentence Breakdown", visible=False)
+                with gr.Accordion("Advanced Humanizer Settings", open=False):
+                    with gr.Row():
+                        model_dropdown = gr.Radio(
+                            choices=["Standard Model", "Advanced Model (Beta)"],
+                            value="Advanced Model (Beta)",
+                            label="Humanizer Model Version",
+                        )
                     with gr.Row():
+                        temperature_slider = gr.Slider(
+                            minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Temperature"
                         )
+                        top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=40, label="Top k")
+                    with gr.Row():
+                        repetition_penalty_slider = gr.Slider(
+                            minimum=1.0, maximum=2.0, step=0.1, value=1, label="Repetition Penalty"
+                        )
+                        length_penalty_slider = gr.Slider(
+                            minimum=0.0, maximum=2.0, step=0.1, value=1.0, label="Length Penalty"
+                        )
+                humanize_btn = gr.Button("Humanize")
+                with gr.Row(equal_height=False):
+                    with gr.Column():
+                        humanizer_feedback = gr.Textbox(label="Add optional feedback on humanizer")
+                    with gr.Column():
+                        report_humanized_btn = gr.Button("Report Humanized Text", variant="primary", visible=True)
+                # humanized_output = gr.Markdown(label="Humanized Article", value="\n\n\n\n", render=True)
+                # copy_to_input_btn = gr.Button("Copy to Input for AI Check")
+            with gr.Tab("History"):
+                history_chat = gr.Chatbot(label="Generation History", height=1000)
+                clear_history_btn = gr.Button("Clear History")
+                clear_history_btn.click(clear_history, outputs=[history, history_chat])
                 """
+                # NOTE: REMOVED REFRESH BUTTON
+                refresh_button = gr.Button("Refresh History")
+                refresh_button.click(get_history, outputs=history_chat)
                 """
+    def regenerate_visible(text):
+        if text:
+            return gr.update(visible=True)
+        else:
+            return gr.update(visible=False)
+    def highlight_visible(text):
+        if text.startswith("Polygraf"):
+            return gr.update(visible=True)
+        else:
+            return gr.update(visible=False)
+    def search_visible(toggle):
+        if toggle:
+            return gr.update(visible=True)
+        else:
+            return gr.update(visible=False)
+    google_search_check.change(search_visible, inputs=google_search_check, outputs=search_options)
+    # ai_detector_dropdown.change(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
+    # output_article.change(regenerate_visible, inputs=output_article, outputs=ai_comments)
+    # ai_comments.change(regenerate_visible, inputs=output_article, outputs=regenerate_btn)
+    ai_check_btn.click(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
+    # Update the default structure based on the selected format
+    # e.g. "Plain Text" for certain formats
+    input_format.change(fn=update_structure, inputs=input_format, outputs=input_structure)
+    model_dropdown.change(fn=update_temperature, inputs=model_dropdown, outputs=temperature_slider)
+    report_humanized_btn.click(
+        save_humanizer_feedback_to_cloud_storage, inputs=[latest_humanizer_data, humanizer_feedback]
+    )
+    generate_btn.click(
+        fn=generate_and_format,
+        inputs=[
+            input_role,
+            input_topic,
+            input_context,
+            input_keywords,
+            input_length,
+            input_format,
+            input_writing_style,
+            input_tone,
+            input_user_category,
+            input_depth,
+            input_structure,
+            input_references,
+            input_num_examples,
+            input_conclusion,
+            # ai_generator,
+            # input_api,
+            google_search_check,
+            year_from,
+            month_from,
+            day_from,
+            year_to,
+            month_to,
+            day_to,
+            domains_to_include,
+            include_sites,
+            exclude_sites,
+            pdf_file_input,
+            history,
+        ],
+        outputs=[output_article, history],
+    )
+    regenerate_btn.click(
+        fn=generate_and_format,
+        inputs=[
+            input_role,
+            input_topic,
+            input_context,
+            input_keywords,
+            input_length,
+            input_format,
+            input_writing_style,
+            input_tone,
+            input_user_category,
+            input_depth,
+            input_structure,
+            input_references,
+            input_num_examples,
+            input_conclusion,
+            # ai_generator,
+            # input_api,
+            google_search_check,
+            year_from,
+            month_from,
+            day_from,
+            year_to,
+            month_to,
+            day_to,
+            domains_to_include,
+            pdf_file_input,
+            history,
+            output_article,
+            include_sites,
+            exclude_sites,
+            ai_comments,
+        ],
+        outputs=[output_article, history],
+    )
+    ai_check_btn.click(
+        fn=ai_check,
+        inputs=[history, ai_detector_dropdown],
+        outputs=[ai_check_result, highlighted_text, mc_check_result],
+    )
+    humanize_btn.click(
+        fn=humanize,
+        inputs=[
+            model_dropdown,
+            output_article,
+            temperature_slider,
+            repetition_penalty_slider,
+            top_k_slider,
+            length_penalty_slider,
+            history,
+        ],
+        outputs=[output_article, history, latest_humanizer_data],
+    )
+    generate_btn.click(get_history, inputs=[history], outputs=[history_chat])
+    regenerate_btn.click(get_history, inputs=[history], outputs=[history_chat])
+    humanize_btn.click(get_history, inputs=[history], outputs=[history_chat])
+# return demo
 if __name__ == "__main__":
+    # demo = create_interface()
+    # demo.queue(
+    #     max_size=2,
+    #     default_concurrency_limit=2,
+    # ).launch(server_name="0.0.0.0", share=True, server_port=7890)
+    demo.launch(server_name="0.0.0.0")

humanize.py CHANGED Viewed

@@ -3,8 +3,9 @@ import torch
 import nltk
 from nltk import sent_tokenize
 import gradio as gr
-from peft import PeftModel
 from transformers import T5ForConditionalGeneration, T5Tokenizer
 nltk.download("punkt")
@@ -49,7 +50,34 @@ FastLanguageModel.for_inference(dec_only_model)  # native 2x faster inference
 print(f"Loaded model: {dec_only}, Num. params: {dec_only_model.num_parameters()}")
-def humanize_batch_seq2seq(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
     inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
     inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
     outputs = model.generate(
@@ -65,7 +93,15 @@ def humanize_batch_seq2seq(model, tokenizer, sentences, temperature, repetition_
     return answers
-def humanize_batch_decoder_only(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
     pre_prompt = "As a humanizer model, your task is to rewrite the following sentence to make it more human-like. Return only the paraphrased sentence. \n\n"
     # Construct the messages_batch using the tokenized sentences
     messages_batch = [{"from": "human", "value": f"{pre_prompt}{sentence}"} for sentence in sentences]
@@ -73,7 +109,12 @@ def humanize_batch_decoder_only(model, tokenizer, sentences, temperature, repeti
     tokenizer = get_chat_template(
         tokenizer,
         chat_template="phi-3",
-        mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},  # ShareGPT style
     )
     # Enable native 2x faster inference
@@ -130,9 +171,11 @@ def humanize_text(
     Paragraphs are stored as a number of sentences per paragraph.
     """
     progress(0, desc="Starting to Humanize")
     # Map model names to their respective processing functions
-    model_map = {"Standard Model": humanize_batch_seq2seq, "Advanced Model (Beta)": humanize_batch_decoder_only}
     assert model_name in model_map, f"Invalid model name: {model_name}"
     process_function = model_map[model_name]
@@ -140,7 +183,10 @@ def humanize_text(
     paragraphs = text.split("\n")
     all_sentences = []
     sentences_per_paragraph = []
     for paragraph in paragraphs:
         sentences = sent_tokenize(paragraph)
         sentences_per_paragraph.append(len(sentences))
         all_sentences.extend(sentences)
@@ -156,8 +202,8 @@ def humanize_text(
             # Call the selected processing function
             paraphrased_batch = process_function(
-                seq2seq_model if model_name == "Standard Model" else dec_only_model,
-                seq2seq_tokenizer if model_name == "Standard Model" else dec_only_tokenizer,
                 batch_sentences,
                 temperature,
                 repetition_penalty,
@@ -188,6 +234,8 @@ def humanize_text(
         humanized_paragraph = " ".join(paraphrased_sentences[sentence_index : sentence_index + num_sentences])
         humanized_paragraphs.append(humanized_paragraph)
         sentence_index += num_sentences
-    humanized_text = "\n".join(humanized_paragraphs)
     return humanized_text

 import nltk
 from nltk import sent_tokenize
 import gradio as gr
 from transformers import T5ForConditionalGeneration, T5Tokenizer
+import language_tool_python
+import re
 nltk.download("punkt")
 print(f"Loaded model: {dec_only}, Num. params: {dec_only_model.num_parameters()}")
+# grammar correction tool
+tool = language_tool_python.LanguageTool("en-US")
def format_and_correct_language_check(text: str) -> str:
    """Return `text` after passing it through the module-level `tool.correct`."""
    corrected = tool.correct(text)
    return corrected
def extract_citations(text):
    """Return the numbers of all `<N>` citation markers in `text`, in order of appearance."""
    return [int(match.group(1)) for match in re.finditer(r"<(\d+)>", text)]
def remove_citations(text):
    """Strip citation markers of the form `<N>` and `[N]` from `text`.

    All other content, including ordinary numbers, is left untouched.

    Args:
        text: Text that may contain citation markers.

    Returns:
        The text with every `<N>` and `[N]` marker removed.
    """
    text = re.sub(r"<\d+>", "", text)
    # The brackets must be escaped: an unescaped `[\d+]` is a character class
    # that would delete every digit and plus sign in the text.
    return re.sub(r"\[\d+\]", "", text)
+def humanize_batch_seq2seq(
+    model,
+    tokenizer,
+    sentences,
+    temperature,
+    repetition_penalty,
+    top_k,
+    length_penalty,
+):
     inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
     inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
     outputs = model.generate(
     return answers
+def humanize_batch_decoder_only(
+    model,
+    tokenizer,
+    sentences,
+    temperature,
+    repetition_penalty,
+    top_k,
+    length_penalty,
+):
     pre_prompt = "As a humanizer model, your task is to rewrite the following sentence to make it more human-like. Return only the paraphrased sentence. \n\n"
     # Construct the messages_batch using the tokenized sentences
     messages_batch = [{"from": "human", "value": f"{pre_prompt}{sentence}"} for sentence in sentences]
     tokenizer = get_chat_template(
         tokenizer,
         chat_template="phi-3",
+        mapping={
+            "role": "from",
+            "content": "value",
+            "user": "human",
+            "assistant": "gpt",
+        },  # ShareGPT style
     )
     # Enable native 2x faster inference
     Paragraphs are stored as a number of sentences per paragraph.
     """
     progress(0, desc="Starting to Humanize")
     # Map model names to their respective processing functions
+    model_map = {
+        "Standard Model": humanize_batch_seq2seq,
+        "Advanced Model (Beta)": humanize_batch_decoder_only,
+    }
     assert model_name in model_map, f"Invalid model name: {model_name}"
     process_function = model_map[model_name]
     paragraphs = text.split("\n")
     all_sentences = []
     sentences_per_paragraph = []
+    citations_per_paragraph = []
     for paragraph in paragraphs:
+        citations_per_paragraph.append(extract_citations(paragraph))
+        paragraph = remove_citations(paragraph)
         sentences = sent_tokenize(paragraph)
         sentences_per_paragraph.append(len(sentences))
         all_sentences.extend(sentences)
             # Call the selected processing function
             paraphrased_batch = process_function(
+                (seq2seq_model if model_name == "Standard Model" else dec_only_model),
+                (seq2seq_tokenizer if model_name == "Standard Model" else dec_only_tokenizer),
                 batch_sentences,
                 temperature,
                 repetition_penalty,
         humanized_paragraph = " ".join(paraphrased_sentences[sentence_index : sentence_index + num_sentences])
         humanized_paragraphs.append(humanized_paragraph)
         sentence_index += num_sentences
+    for i, paragraph in enumerate(humanized_paragraphs):
+        citation_texts = [f"<{cid}>" for cid in citations_per_paragraph[i]]
+        humanized_paragraphs[i] = paragraph + " " + "".join(citation_texts)
+    humanized_text = "\n\n".join(humanized_paragraphs)
     return humanized_text

requirements.txt CHANGED Viewed

@@ -24,4 +24,5 @@ langchain-google-genai
 langchain-anthropic
 langchain-openai
 vertexai
-html2text

 langchain-anthropic
 langchain-openai
 vertexai
+html2text
+bm25s