andreeabodea
/

Extract_Project_Report_Section_1

Model card Files Files and versions Community

andreeabodea committed on Apr 11, 2024

Commit

bb7fdc2

verified ·

1 Parent(s): 6dcfc3f

Update app.py

Browse files

Files changed (1) hide show

app.py +172 -116

app.py CHANGED Viewed

@@ -1,128 +1,184 @@
-import os
-import pdfplumber
-import re
 import gradio as gr
-from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
 from io import BytesIO
-import torch
-"""
-Extract the text from a section of a PDF file  between 'wanted_section' and 'next_section'.
-Parameters:
-- path (str): The file path to the PDF file.
-- wanted_section (str): The section to start extracting text from.
-- next_section (str): The section to stop extracting text at.
-Returns:
-- text (str): The extracted text from the specified section range.
-"""
-def get_section(path, wanted_section, next_section):
-    print(wanted_section)
-    # Open the PDF file
-    doc = pdfplumber.open(BytesIO(path))
-    start_page = []
-    end_page = []
-    # Find the all the pages for the specified sections
-    for page in range(len(doc.pages)):
-        if len(doc.pages[page].search(wanted_section, return_chars=False, case=False)) > 0:
-            start_page.append(page)
-        if len(doc.pages[page].search(next_section, return_chars=False, case=False)) > 0:
-            end_page.append(page)
-    # Extract the text between the start and end page of the wanted section
     text = []
-    for page_num in range(max(start_page), max(end_page)+1):
-        page = doc.pages[page_num]
-        text.append(page.extract_text())
-    text = " ".join(text)
-    final_text = text.replace("\n", " ")
-    return final_text
-def extract_between(big_string, start_string, end_string):
-    # Use a non-greedy match for content between start_string and end_string
-    pattern = re.escape(start_string) + '(.*?)' + re.escape(end_string)
-    match = re.search(pattern, big_string, re.DOTALL)
-    if match:
-        # Return the content without the start and end strings
-        return match.group(1)
-    else:
-        # Return None if the pattern is not found
-        return None
-def format_section1(section1_text):
-    result_section1_dict = {}
-    result_section1_dict['TOPIC'] = extract_between(section1_text, "Sektor", "EZ-Programm")
-    result_section1_dict['PROGRAM'] = extract_between(section1_text, "Sektor", "EZ-Programm")
-    result_section1_dict['PROJECT DESCRIPTION'] = extract_between(section1_text, "EZ-Programmziel", "Datum der letzten BE")
-    result_section1_dict['PROJECT NAME'] = extract_between(section1_text, "Modul", "Modulziel")
-    result_section1_dict['OBJECTIVE'] = extract_between(section1_text, "Modulziel", "Berichtszeitraum")
-    result_section1_dict['PROGRESS'] = extract_between(section1_text, "Zielerreichung des Moduls", "Massnahme im Zeitplan")
-    result_section1_dict['STATUS'] = extract_between(section1_text, "Massnahme im Zeitplan", "Risikoeinschätzung")
-    result_section1_dict['RECOMMENDATIONS'] = extract_between(section1_text, "Vorschläge zur Modulanpas-", "Voraussichtliche")
-    return result_section1_dict
-def answer_questions(text,language="de"):
-    # Initialize the zero-shot classification pipeline
-    model_name = "deepset/gelectra-large-germanquad"
-    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    # Initialize the QA pipeline
-    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
     questions = [
         "Welches ist das Titel des Moduls?",
         "Welches ist das Sektor oder das Kernthema?",
         "Welches ist das Land?",
-        "Zu welchem Program oder EZ-Programm gehort das Projekt?"
-        #"Welche Durchführungsorganisation aus den 4 Varianten 'giz', 'kfw', 'ptb' und 'bgr' implementiert das Projekt?"
-        # "In dem Dokument was steht bei Sektor?",
-        # "In dem Dokument was steht von 'EZ-Programm' bis 'EZ-Programmziel'?",
-        # "In dem Dokument was steht bei EZ-Programmziel?",
-        # "In dem Dokument in dem Abschnitt '1. Kurzbeschreibung' was steht bei Modul?",
-        # "In dem Dokument was steht bei Zielerreichung des Moduls?",
-        # "In dem Dokument in dem Abschnitt '1. Kurzbeschreibung' was steht bei Maßnahme im Zeitplan?",
-        # "In dem Dokument was steht bei Vorschläge zur Modulanpassung?",
-        # "In dem Dokument in dem Abschnitt 'Anlage 1: Wirkungsmatrix des Moduls' was steht unter Laufzeit als erstes Datum?",
-        # "In dem Dokument in dem Abschnitt 'Anlage 1: Wirkungsmatrix des Moduls' was steht unter Laufzeit als zweites Datum?"
     ]
-    # Iterate over each question and get answers
-    answers_dict = {}
-    for question in questions:
-        result = qa_pipeline(question=question, context=text)
-        # print(f"Question: {question}")
-        # print(f"Answer: {result['answer']}\n")
-        answers_dict[question] = result['answer']
-    return answers_dict
-def process_pdf(path):
-    results_dict = {}
-    results_dict["1. Kurzbeschreibung"] = \
-        get_section(path, "1. Kurzbeschreibung", "2. Einordnung des Moduls")
-    answers = answer_questions(results_dict["1. Kurzbeschreibung"])
     return answers
-def get_first_page_text(file_data):
-    doc = pdfplumber.open(BytesIO(file_data))
-    if len(doc.pages):
-        return doc.pages[0].extract_text()
 if __name__ == "__main__":
-    # Define the Gradio interface
-    # iface = gr.Interface(fn=process_pdf,
-    demo = gr.Interface(fn=process_pdf,
-                     inputs=gr.File(type="binary", label="Upload PDF"),
-                     outputs=gr.Textbox(label="Extracted Text"),
-                     title="PDF Text Extractor",
-                     description="Upload a PDF file to extract.")
-    demo.launch()

+# import os
+# import pdfplumber
+# import re
+# import gradio as gr
+# from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
+# from io import BytesIO
+# import torch
+# """
+# Extract the text from a section of a PDF file  between 'wanted_section' and 'next_section'.
+# Parameters:
+# - path (str): The file path to the PDF file.
+# - wanted_section (str): The section to start extracting text from.
+# - next_section (str): The section to stop extracting text at.
+# Returns:
+# - text (str): The extracted text from the specified section range.
+# """
+# def get_section(path, wanted_section, next_section):
+#     print(wanted_section)
+#     # Open the PDF file
+#     doc = pdfplumber.open(BytesIO(path))
+#     start_page = []
+#     end_page = []
+#     # Find the all the pages for the specified sections
+#     for page in range(len(doc.pages)):
+#         if len(doc.pages[page].search(wanted_section, return_chars=False, case=False)) > 0:
+#             start_page.append(page)
+#         if len(doc.pages[page].search(next_section, return_chars=False, case=False)) > 0:
+#             end_page.append(page)
+#     # Extract the text between the start and end page of the wanted section
+#     text = []
+#     for page_num in range(max(start_page), max(end_page)+1):
+#         page = doc.pages[page_num]
+#         text.append(page.extract_text())
+#     text = " ".join(text)
+#     final_text = text.replace("\n", " ")
+#     return final_text
+# def extract_between(big_string, start_string, end_string):
+#     # Use a non-greedy match for content between start_string and end_string
+#     pattern = re.escape(start_string) + '(.*?)' + re.escape(end_string)
+#     match = re.search(pattern, big_string, re.DOTALL)
+#     if match:
+#         # Return the content without the start and end strings
+#         return match.group(1)
+#     else:
+#         # Return None if the pattern is not found
+#         return None
+# def format_section1(section1_text):
+#     result_section1_dict = {}
+#     result_section1_dict['TOPIC'] = extract_between(section1_text, "Sektor", "EZ-Programm")
+#     result_section1_dict['PROGRAM'] = extract_between(section1_text, "Sektor", "EZ-Programm")
+#     result_section1_dict['PROJECT DESCRIPTION'] = extract_between(section1_text, "EZ-Programmziel", "Datum der letzten BE")
+#     result_section1_dict['PROJECT NAME'] = extract_between(section1_text, "Modul", "Modulziel")
+#     result_section1_dict['OBJECTIVE'] = extract_between(section1_text, "Modulziel", "Berichtszeitraum")
+#     result_section1_dict['PROGRESS'] = extract_between(section1_text, "Zielerreichung des Moduls", "Massnahme im Zeitplan")
+#     result_section1_dict['STATUS'] = extract_between(section1_text, "Massnahme im Zeitplan", "Risikoeinschätzung")
+#     result_section1_dict['RECOMMENDATIONS'] = extract_between(section1_text, "Vorschläge zur Modulanpas-", "Voraussichtliche")
+#     return result_section1_dict
+# def answer_questions(text,language="de"):
+#     # Initialize the zero-shot classification pipeline
+#     model_name = "deepset/gelectra-large-germanquad"
+#     model = AutoModelForQuestionAnswering.from_pretrained(model_name)
+#     tokenizer = AutoTokenizer.from_pretrained(model_name)
+#     # Initialize the QA pipeline
+#     qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
+#     questions = [
+#         "Welches ist das Titel des Moduls?",
+#         "Welches ist das Sektor oder das Kernthema?",
+#         "Welches ist das Land?",
+#         "Zu welchem Program oder EZ-Programm gehort das Projekt?"
+#         #"Welche Durchführungsorganisation aus den 4 Varianten 'giz', 'kfw', 'ptb' und 'bgr' implementiert das Projekt?"
+#         # "In dem Dokument was steht bei Sektor?",
+#         # "In dem Dokument was steht von 'EZ-Programm' bis 'EZ-Programmziel'?",
+#         # "In dem Dokument was steht bei EZ-Programmziel?",
+#         # "In dem Dokument in dem Abschnitt '1. Kurzbeschreibung' was steht bei Modul?",
+#         # "In dem Dokument was steht bei Zielerreichung des Moduls?",
+#         # "In dem Dokument in dem Abschnitt '1. Kurzbeschreibung' was steht bei Maßnahme im Zeitplan?",
+#         # "In dem Dokument was steht bei Vorschläge zur Modulanpassung?",
+#         # "In dem Dokument in dem Abschnitt 'Anlage 1: Wirkungsmatrix des Moduls' was steht unter Laufzeit als erstes Datum?",
+#         # "In dem Dokument in dem Abschnitt 'Anlage 1: Wirkungsmatrix des Moduls' was steht unter Laufzeit als zweites Datum?"
+#     ]
+#     # Iterate over each question and get answers
+#     answers_dict = {}
+#     for question in questions:
+#         result = qa_pipeline(question=question, context=text)
+#         # print(f"Question: {question}")
+#         # print(f"Answer: {result['answer']}\n")
+#         answers_dict[question] = result['answer']
+#     return answers_dict
+# def process_pdf(path):
+#     results_dict = {}
+#     results_dict["1. Kurzbeschreibung"] = \
+#         get_section(path, "1. Kurzbeschreibung", "2. Einordnung des Moduls")
+#     answers = answer_questions(results_dict["1. Kurzbeschreibung"])
+#     return answers
+# def get_first_page_text(file_data):
+#     doc = pdfplumber.open(BytesIO(file_data))
+#     if len(doc.pages):
+#         return doc.pages[0].extract_text()
+# if __name__ == "__main__":
+#     # Define the Gradio interface
+#     # iface = gr.Interface(fn=process_pdf,
+#     # demo = gr.Interface(fn=process_pdf,
+#     #                  inputs=gr.File(type="binary", label="Upload PDF"),
+#     #                  outputs=gr.Textbox(label="Extracted Text"),
+#     #                  title="PDF Text Extractor",
+#     #                  description="Upload a PDF file to extract.")
+#     # demo.launch()
+#      demo = gr.Interface(fn=process_pdf,
+#                      inputs=gr.File(type="pdf"),
+#                      outputs="text",
+#                      title="PDF Text Extractor",
+#                      description="Upload a PDF file to extract.")
+#     demo.launch()
 import gradio as gr
+import pdfplumber
+from transformers import pipeline
 from io import BytesIO
+import re
+# Initialize the question-answering pipeline with a specific pre-trained model.
+# It is built once at module level and reused by answer_questions() for every
+# question, instead of being re-created on each call.
+qa_pipeline = pipeline("question-answering", model="deepset/gelectra-large-germanquad")
+def extract_text_from_pdf(file_obj):
+    """Return the text of every page of a PDF, joined by single spaces.
+
+    ``file_obj`` is handed directly to ``pdfplumber.open``. Pages for which
+    ``extract_text()`` yields nothing (``None`` or an empty string) are
+    skipped, so they contribute no extra separators to the result.
+    """
     text = []
+    with pdfplumber.open(file_obj) as pdf:
+        # filter(None, ...) drops pages whose extracted text is falsy.
+        text.extend(filter(None, (page.extract_text() for page in pdf.pages)))
+    return " ".join(text)
+def answer_questions(context):
+    """Ask each predefined German question against ``context``.
+
+    Returns a dict mapping every question string to the ``'answer'`` field of
+    the result produced by the module-level ``qa_pipeline`` for that question.
+    """
     questions = [
         "Welches ist das Titel des Moduls?",
         "Welches ist das Sektor oder das Kernthema?",
         "Welches ist das Land?",
+        "Zu welchem Program oder EZ-Programm gehört das Projekt?"
     ]
+    answers = {}
+    for question in questions:
+        prediction = qa_pipeline(question=question, context=context)
+        answers[question] = prediction['answer']
     return answers
+def process_pdf(file):
+    """Extract the text of an uploaded PDF and answer the predefined questions.
+
+    Parameters:
+    - file: the uploaded PDF as delivered by the Gradio file input. Raw
+      ``bytes``, a filesystem path, or an open binary file object are all
+      accepted, because the delivered value depends on how the file
+      component is configured.
+
+    Returns:
+    - str: one ``"question: answer"`` line per question, or a short message
+      when no file was uploaded or the PDF contains no extractable text.
+    """
+    if file is None:
+        # The interface was submitted without an upload.
+        return "No PDF file was uploaded."
+    if isinstance(file, (bytes, bytearray)):
+        pdf_bytes = bytes(file)
+    elif isinstance(file, str) or hasattr(file, "__fspath__"):
+        # A path to the uploaded file on disk.
+        with open(file, "rb") as pdf_file:
+            pdf_bytes = pdf_file.read()
+    else:
+        # An open file object: read it and close it afterwards.
+        with file as pdf_file:
+            pdf_bytes = pdf_file.read()
+    text = extract_text_from_pdf(BytesIO(pdf_bytes))
+    if not text.strip():
+        # Nothing to answer from (e.g. a scanned PDF without a text layer);
+        # report that instead of passing an empty context to the QA model.
+        return "No extractable text was found in the PDF."
+    results = answer_questions(text)
+    return "\n".join(f"{q}: {a}" for q, a in results.items())
+# Define the Gradio interface
+iface = gr.Interface(
+    fn=process_pdf,
+    # type="binary" hands the upload to process_pdf as bytes; file_types
+    # restricts the upload picker to PDF files.
+    inputs=gr.File(type="binary", file_types=[".pdf"], label="Upload your PDF file"),
+    outputs=gr.Textbox(label="Extracted Information and Answers"),
+    title="PDF Text Extractor and Question Answerer",
+    description="Upload a PDF file to extract text and answer predefined questions based on the content."
+)
 if __name__ == "__main__":
+    iface.launch()