minko186 commited on
Commit
c1769c1
·
1 Parent(s): f6b1cb0

Cleaned up the output format and switched all records of text to the new format

Browse files
Files changed (3) hide show
  1. ai_generate.py +52 -14
  2. app.py +21 -19
  3. humanize.py +33 -0
ai_generate.py CHANGED
@@ -111,18 +111,43 @@ def remove_citations(text):
111
  return text
112
 
113
 
114
- def process_cited_text(data, docs):
115
- # Initialize variables for the combined text and a dictionary for citations
116
  combined_text = ""
117
  citations = {}
118
  # Iterate through the cited_text list
119
  if 'cited_text' in data:
120
  for item in data['cited_text']:
121
- chunk_text = item['chunk'][0]['text']
122
- combined_text += chunk_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  citation_ids = []
124
- # Process the citations for the chunk
125
- if item['chunk'][1]['citations']:
126
  for c in item['chunk'][1]['citations']:
127
  if c and 'citation' in c:
128
  citation = c['citation']
@@ -133,16 +158,12 @@ def process_cited_text(data, docs):
133
  citation_ids.append(int(citation))
134
  except ValueError:
135
  pass # Handle cases where the string is not a valid integer
136
- if citation_ids:
137
- citation_texts = [f"<{cid}>" for cid in citation_ids]
138
- combined_text += " " + "".join(citation_texts)
139
- combined_text += "\n\n"
140
  # Store unique citations in a dictionary
141
  for citation_id in citation_ids:
142
  if citation_id not in citations:
143
  citations[citation_id] = {'source': docs[citation_id].metadata['source'], 'content': docs[citation_id].page_content}
144
 
145
- return combined_text.strip(), citations
146
 
147
 
148
  def citations_to_html(citations):
@@ -236,8 +257,8 @@ def generate_rag(
236
  | XMLOutputParser()
237
  )
238
  result = rag_chain.invoke({"input": prompt})
239
- text, citations = process_cited_text(result, docs)
240
- return text, citations
241
 
242
  def generate_base(
243
  prompt: str, topic: str, model: str, temperature: float, max_length: int, api_key: str, sys_message=""
@@ -248,7 +269,10 @@ def generate_base(
248
  return None, None
249
  try:
250
  output = llm.invoke(prompt).content
251
- return output, None
 
 
 
252
  except Exception as e:
253
  print(f"An error occurred while running the model: {e}")
254
  return None, None
@@ -269,3 +293,17 @@ def generate(
269
  return generate_rag(prompt, topic, model, url_content, path, temperature, max_length, api_key, sys_message)
270
  else:
271
  return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  return text
112
 
113
 
114
+ def display_cited_text(data):
 
115
  combined_text = ""
116
  citations = {}
117
  # Iterate through the cited_text list
118
  if 'cited_text' in data:
119
  for item in data['cited_text']:
120
+ if 'chunk' in item and len(item['chunk']) > 0:
121
+ chunk_text = item['chunk'][0].get('text')
122
+ combined_text += chunk_text
123
+ citation_ids = []
124
+ # Process the citations for the chunk
125
+ if len(item['chunk']) > 1 and item['chunk'][1]['citations']:
126
+ for c in item['chunk'][1]['citations']:
127
+ if c and 'citation' in c:
128
+ citation = c['citation']
129
+ if isinstance(citation, dict) and "source_id" in citation:
130
+ citation = citation['source_id']
131
+ if isinstance(citation, str):
132
+ try:
133
+ citation_ids.append(int(citation))
134
+ except ValueError:
135
+ pass # Handle cases where the string is not a valid integer
136
+ if citation_ids:
137
+ citation_texts = [f"<{cid}>" for cid in citation_ids]
138
+ combined_text += " " + "".join(citation_texts)
139
+ combined_text += "\n\n"
140
+ return combined_text
141
+
142
+
143
+ def get_citations(data, docs):
144
+ # Initialize variables for the combined text and a dictionary for citations
145
+ citations = {}
146
+ # Iterate through the cited_text list
147
+ if data.get('cited_text'):
148
+ for item in data['cited_text']:
149
  citation_ids = []
150
+ if 'chunk' in item and len(item['chunk']) > 1 and item['chunk'][1].get('citations'):
 
151
  for c in item['chunk'][1]['citations']:
152
  if c and 'citation' in c:
153
  citation = c['citation']
 
158
  citation_ids.append(int(citation))
159
  except ValueError:
160
  pass # Handle cases where the string is not a valid integer
 
 
 
 
161
  # Store unique citations in a dictionary
162
  for citation_id in citation_ids:
163
  if citation_id not in citations:
164
  citations[citation_id] = {'source': docs[citation_id].metadata['source'], 'content': docs[citation_id].page_content}
165
 
166
+ return citations
167
 
168
 
169
  def citations_to_html(citations):
 
257
  | XMLOutputParser()
258
  )
259
  result = rag_chain.invoke({"input": prompt})
260
+ citations = get_citations(result, docs)
261
+ return result, citations
262
 
263
  def generate_base(
264
  prompt: str, topic: str, model: str, temperature: float, max_length: int, api_key: str, sys_message=""
 
269
  return None, None
270
  try:
271
  output = llm.invoke(prompt).content
272
+ output_dict = {'cited_text': [
273
+ {'chunk': [{'text': output}, {'citations': None}]}
274
+ ]}
275
+ return output_dict, None
276
  except Exception as e:
277
  print(f"An error occurred while running the model: {e}")
278
  return None, None
 
293
  return generate_rag(prompt, topic, model, url_content, path, temperature, max_length, api_key, sys_message)
294
  else:
295
  return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
296
+
297
+ # prompt = "Write a short 200 word report with an introduction about the current methods of ai detection and the results."
298
+ # topic = "the current methods of ai detection"
299
+
300
+ # text, citations = generate(
301
+ # prompt,
302
+ # topic,
303
+ # "OpenAI GPT 4o",
304
+ # None,
305
+ # ["./final_report.pdf","./detection_tools.pdf"],
306
+ # )
307
+ # from pprint import pprint
308
+ # print(text)
309
+ # print(citations)
app.py CHANGED
@@ -18,8 +18,8 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipe
18
 
19
  from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
20
  from google_search import google_search, months, domain_list, build_date
21
- from humanize import humanize_text, device
22
- from ai_generate import generate, citations_to_html, remove_citations
23
  import nltk
24
  nltk.download('punkt_tab')
25
 
@@ -380,11 +380,14 @@ def generate_article(
380
  api_key=api_key,
381
  sys_message="",
382
  )
383
- return clean_text(article), citations_to_html(citations)
384
 
385
 
386
  def get_history(history):
387
- return history
 
 
 
388
 
389
 
390
  def clear_history():
@@ -393,7 +396,6 @@ def clear_history():
393
 
394
 
395
  def humanize(
396
- text: str,
397
  model: str,
398
  temperature: float = 1.2,
399
  repetition_penalty: float = 1,
@@ -402,21 +404,22 @@ def humanize(
402
  history=None,
403
  ) -> str:
404
  print("Humanizing text...")
405
- body, references = split_text_from_refs(text)
406
- result = humanize_text(
407
- text=body,
 
408
  model_name=model,
409
  temperature=temperature,
410
  repetition_penalty=repetition_penalty,
411
  top_k=top_k,
412
  length_penalty=length_penalty,
413
  )
414
- result = result + references
415
- corrected_text = format_and_correct_language_check(result)
416
 
417
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
418
- history.append((f"Humanized Text | {timestamp}\nInput: {model}", corrected_text))
419
- return corrected_text, history
420
 
421
 
422
  def update_visibility_api(model: str):
@@ -600,13 +603,13 @@ def generate_and_format(
600
  generated_article,
601
  user_comments,
602
  )
603
- if ends_with_references(article) and url_content is not None:
604
- for url in url_content.keys():
605
- article += f"\n{url}"
606
 
607
- reference_formatted = format_references(article)
608
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
609
- history.append((f"Generated Text | {timestamp}\nInput: {topic}", reference_formatted))
610
 
611
  # Save the article and metadata to Cloud Storage
612
  # We dont save if there is PDF input for privacy reasons
@@ -636,7 +639,7 @@ def generate_and_format(
636
  )
637
  print(save_message)
638
 
639
- return reference_formatted, citations, history
640
 
641
 
642
  def create_interface():
@@ -1024,7 +1027,6 @@ def create_interface():
1024
  humanize_btn.click(
1025
  fn=humanize,
1026
  inputs=[
1027
- output_article,
1028
  model_dropdown,
1029
  temperature_slider,
1030
  repetition_penalty_slider,
 
18
 
19
  from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
20
  from google_search import google_search, months, domain_list, build_date
21
+ from humanize import humanize_text, device, humanize_chunk
22
+ from ai_generate import generate, citations_to_html, remove_citations, display_cited_text
23
  import nltk
24
  nltk.download('punkt_tab')
25
 
 
380
  api_key=api_key,
381
  sys_message="",
382
  )
383
+ return article, citations_to_html(citations)
384
 
385
 
386
  def get_history(history):
387
+ history_formatted = []
388
+ for entry in history:
389
+ history_formatted.append((entry[0], display_cited_text(entry[1])))
390
+ return history_formatted
391
 
392
 
393
  def clear_history():
 
396
 
397
 
398
  def humanize(
 
399
  model: str,
400
  temperature: float = 1.2,
401
  repetition_penalty: float = 1,
 
404
  history=None,
405
  ) -> str:
406
  print("Humanizing text...")
407
+ # body, references = split_text_from_refs(text)
408
+ cited_text = history[-1][1]
409
+ result = humanize_chunk(
410
+ data = cited_text,
411
  model_name=model,
412
  temperature=temperature,
413
  repetition_penalty=repetition_penalty,
414
  top_k=top_k,
415
  length_penalty=length_penalty,
416
  )
417
+ # result = result + references
418
+ # corrected_text = format_and_correct_language_check(result)
419
 
420
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
421
+ history.append((f"Humanized Text | {timestamp}\nInput: {model}", result))
422
+ return clean_text(display_cited_text(result)), history
423
 
424
 
425
  def update_visibility_api(model: str):
 
603
  generated_article,
604
  user_comments,
605
  )
606
+ # if ends_with_references(article) and url_content is not None:
607
+ # for url in url_content.keys():
608
+ # article += f"\n{url}"
609
 
610
+ # reference_formatted = format_references(article)
611
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
612
+ history.append((f"Generated Text | {timestamp}\nInput: {topic}", article))
613
 
614
  # Save the article and metadata to Cloud Storage
615
  # We dont save if there is PDF input for privacy reasons
 
639
  )
640
  print(save_message)
641
 
642
+ return clean_text(display_cited_text(article)), citations, history
643
 
644
 
645
  def create_interface():
 
1027
  humanize_btn.click(
1028
  fn=humanize,
1029
  inputs=[
 
1030
  model_dropdown,
1031
  temperature_slider,
1032
  repetition_penalty_slider,
humanize.py CHANGED
@@ -5,6 +5,7 @@ from nltk import sent_tokenize
5
  import gradio as gr
6
  from peft import PeftModel
7
  from transformers import T5ForConditionalGeneration, T5Tokenizer
 
8
 
9
  nltk.download("punkt")
10
 
@@ -49,6 +50,12 @@ FastLanguageModel.for_inference(dec_only_model) # native 2x faster inference
49
  print(f"Loaded model: {dec_only}, Num. params: {dec_only_model.num_parameters()}")
50
 
51
 
 
 
 
 
 
 
52
  def humanize_batch_seq2seq(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
53
  inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
54
  inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
@@ -191,3 +198,29 @@ def humanize_text(
191
 
192
  humanized_text = "\n".join(humanized_paragraphs)
193
  return humanized_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import gradio as gr
6
  from peft import PeftModel
7
  from transformers import T5ForConditionalGeneration, T5Tokenizer
8
+ import language_tool_python
9
 
10
  nltk.download("punkt")
11
 
 
50
  print(f"Loaded model: {dec_only}, Num. params: {dec_only_model.num_parameters()}")
51
 
52
 
53
+ # grammar correction tool
54
+ tool = language_tool_python.LanguageTool("en-US")
55
+
56
+ def format_and_correct_language_check(text: str) -> str:
57
+ return tool.correct(text)
58
+
59
  def humanize_batch_seq2seq(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
60
  inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
61
  inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
 
198
 
199
  humanized_text = "\n".join(humanized_paragraphs)
200
  return humanized_text
201
+
202
+
203
+ def humanize_chunk(
204
+ data,
205
+ progress=gr.Progress(),
206
+ model_name="Standard Model",
207
+ temperature=1.2,
208
+ repetition_penalty=1.0,
209
+ top_k=50,
210
+ length_penalty=1.0,
211
+ ):
212
+ humanized_chunks = {'cited_text': []}
213
+ if 'cited_text' in data:
214
+ for item in data['cited_text']:
215
+ humanized_chunk = {'chunk': [{'text': ""}, {'citations': None}]}
216
+ if 'chunk' in item and len(item['chunk']) > 0:
217
+ chunk_text = item['chunk'][0].get('text')
218
+ humanized_chunk['chunk'][0] = {'text': format_and_correct_language_check(humanize_text(chunk_text))}
219
+
220
+ citation_ids = []
221
+ # Process the citations for the chunk
222
+ if len(item['chunk']) > 1 and item['chunk'][1]['citations']:
223
+ humanized_chunk['chunk'][1] = {'citations': item['chunk'][1]['citations']}
224
+ humanized_chunks['cited_text'].append(humanized_chunk)
225
+ return humanized_chunks
226
+