Spaces:
Runtime error
Runtime error
File size: 10,839 Bytes
27b5d83 66b7053 27b5d83 66b7053 27b5d83 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 |
from fasthtml_hf import setup_hf_backup
import io
import os
import traceback
from pydantic_core import from_json
from fasthtml.common import *
from PyPDF2 import PdfReader
from PyPDF2 import PdfReader
from langchain.chains.summarize import load_summarize_chain
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from pydantic import BaseModel, Field, ValidationError
from langchain.output_parsers import PydanticOutputParser
# Initialize the FastHTML application and its route decorator
app, rt = fast_app()
# Define Pydantic models for structured output
# SummaryLine represents a single summary item with its keywords and description
class SummaryLine(BaseModel):
    # One summary sentence plus the phrases to highlight in it and a longer
    # explanatory description. Documented with `#` comments on purpose: a
    # class docstring would be copied into the JSON schema that is sent to
    # the model as format instructions.

    # The summary sentence itself (at most 200 characters).
    summary_item: str = Field(
        description="Actual summary sentence that contains highlighting key data points or information.",
        max_length=200,
    )

    # Phrases from `summary_item` that should be emphasised when rendered.
    keywords: List[str] = Field(
        description="A list of exact words or phrases in the summary item that highlights most important data points or key ideas.",
    )

    # Longer background text (200-500 characters). The misspelled field name
    # is part of the schema and is referenced elsewhere, so it is kept as is.
    brief_descripton_of_summary: str = Field(
        description="This is elaborate description to provide context or background to the summary item.",
        min_length=200,
        max_length=500,
    )
# TopicSummaries represents a collection of summaries for a specific topic
class TopicSummaries(BaseModel):
    # All summary lines produced for a single topic. Documented with `#`
    # comments rather than a docstring so the generated JSON schema (used as
    # format instructions for the model) does not gain a description.

    # Topic name, as listed in the summarization instructions.
    topic: str = Field(description="Topics of summary as mentioned in the instructions.")

    # Between 3 and 5 summary lines for the topic. `min_length`/`max_length`
    # are the pydantic v2 spellings of the deprecated `min_items`/`max_items`
    # list constraints.
    summaries: List[SummaryLine] = Field(
        description="This a list summary for a topic with each one having it's own keywords and context.",
        min_length=3,
        max_length=5,
    )
# CompleteSummary is the top-level model containing all topic summaries
class CompleteSummary(BaseModel):
    # Top-level object the output parser builds from the model response:
    # one TopicSummaries entry per requested topic.
    summaries_list: List[TopicSummaries]
# Opening instruction of the summarization prompt. The topic list
# (summary_sections or the user's own text) is appended right after it.
summarize_template = """
Write a concise summary of the case study given in the context. The summary should be based on the following topics.
"""
# Default topic list, one "- Name: description" entry per line. It is also
# shown in the form's instruction textarea as the editable example.
summary_sections = """
- Factual: Facts or information that contains numbers, dates, events etc. that are mostly quantitative or qualitative data
- SWOT: Key Strength, weakness, opportunities or threats that are mentioned in the case study
- Decisions and Outcomes: Key decisions taken and it's successful or failed outcomes and reasons
- Ethical and Governance: Key considerations from ethical and governance perspective
"""
# Prompt section that wraps the document text for one-pass summarization.
# {context_content} is a prompt-template placeholder filled with the extracted
# PDF text; the schema (format instructions) is appended after this string.
context_str = """
<context>
{context_content}
</context>
The response must follow the following schema strictly. There will be penalty for not following the schema.
"""
# Template for folding an earlier summary ({previous_summary}) together with
# additional context into a final summary.
# NOTE(review): not referenced by any code visible in this file - confirm
# whether it is still needed.
refine_str = """The following are set of summaries given in a markdown format:
{previous_summary}
Now add the above summary with more context given below and create final summary, which should contain the following sections.
"""
# Function to get the appropriate language model based on user selection
def getModel(model, key):
    """Build the chat model chosen in the form, authenticated with ``key``.

    Args:
        model: Provider name from the form. ``'OpenAI'`` selects OpenAI; any
            other value falls back to Anthropic.
        key: API key for the selected provider.

    Returns:
        A ``ChatOpenAI`` or ``ChatAnthropic`` instance.
    """
    # The key is handed to the client directly instead of being written to
    # os.environ: the environment is shared by every request served by this
    # process, so one user's secret would otherwise remain visible to (and be
    # overwritten by) later requests.
    if model == 'OpenAI':
        return ChatOpenAI(
            temperature=0,       # 0 for the most deterministic output
            model="gpt-4o",
            max_tokens=4096,     # cap the response length
            openai_api_key=key,
        )
    return ChatAnthropic(
        model='claude-3-5-sonnet-20240620',
        anthropic_api_key=key,
    )
# Function to highlight specific keywords in the text
def highlight_text(text, key_words):
    """Return a ``Div`` holding ``text`` with every keyword shown in bold red.

    The components are built directly instead of generating HTML, converting
    it to Python source with ``html2ft`` and running ``eval`` on the result:
    both ``text`` and ``key_words`` come from model output derived from an
    uploaded document, so evaluating code built from them is unsafe.

    Args:
        text: The sentence to render.
        key_words: Exact words or phrases inside ``text`` to emphasise.

    Returns:
        A ``Div`` whose children alternate between plain text segments and
        ``Span(B(keyword), style="color:red;")`` elements.
    """
    import re

    # Drop empty keywords (an empty pattern would match between every
    # character) and try longer phrases first so that "net profit" wins over
    # "profit" when both are listed.
    phrases = sorted({w for w in (key_words or ()) if w}, key=len, reverse=True)
    if not phrases:
        return Div(text)

    # A single pass over the original text: matches are found in `text` only,
    # so a keyword such as "span" or "red" can never hit markup that was
    # inserted for an earlier keyword.
    matcher = re.compile("|".join(re.escape(p) for p in phrases))
    children = []
    cursor = 0
    for hit in matcher.finditer(text):
        if hit.start() > cursor:
            children.append(text[cursor:hit.start()])
        children.append(Span(B(hit.group(0)), style="color:red;"))
        cursor = hit.end()
    if cursor < len(text):
        children.append(text[cursor:])
    return Div(*children)
# Function to generate an HTML table from the summary object
def generate_table(summaries_obj):
    """Render a summary object as an HTML table grouped by topic.

    Each topic contributes one row per summary line. The first row of a topic
    carries the topic cell, which spans all of that topic's rows; every row
    has the highlighted summary sentence and a collapsible description.

    Args:
        summaries_obj: Object with a ``summaries_list`` of topics, each having
            ``topic`` and ``summaries``; every summary provides
            ``summary_item``, ``keywords`` and ``brief_descripton_of_summary``.

    Returns:
        ``Div(Card(Table(...)))`` ready to be returned from a route.
    """
    # NOTE(review): the header names two columns while each body row has a
    # third (description) cell; kept as in the original layout.
    column_names = ['Topic', "Summary"]
    table_header = Thead(Tr(*[Th(name) for name in column_names]))

    table_rows = []
    for topic_summary in summaries_obj.summaries_list:
        row_count = len(topic_summary.summaries)
        for position, summary in enumerate(topic_summary.summaries):
            cells = [
                Td(highlight_text(summary.summary_item, summary.keywords),
                   style="width: 60%;"),
                Td(Div(Details(Summary(style="summary::-webkit-details-marker { display: none }; list-style-type: '+'"),
                               P(summary.brief_descripton_of_summary)),
                       style="padding: 0.5em 0.5em 0;"),
                   style="width: 30%;"),
            ]
            if position == 0:
                # Only the topic cell spans the group. The original also put a
                # stray "rowspan=..." inside the style of later rows' summary
                # cells, which is not valid CSS and has been dropped.
                cells.insert(0, Td(topic_summary.topic,
                                   rowspan=f"{row_count}",
                                   style="width: 10%;"))
            table_rows.append(Tr(*cells))

    return Div(Card(Table(table_header, Tbody(*table_rows))))
# Function to perform one-pass summarization on the given pages
def onepass_summarize(pages, summary_sections, model):
    """Summarize the document text in a single model call.

    Builds a prompt from the fixed instruction, the requested topics, the
    document text and the output schema, sends it to ``model`` and parses the
    reply into a ``CompleteSummary``.

    Args:
        pages: Document text to summarize (placed inside ``<context>``).
        summary_sections: Topic list and descriptions, typically the text the
            user entered in the form.
        model: Chat model to run the prompt against.

    Returns:
        The ``CompleteSummary`` produced by the output parser.
    """
    # Same text as before, logged with the topics inlined for debugging.
    print("Onepass instruction: " + summarize_template + summary_sections
          + context_str + "{format_instructions}")

    output_parser = PydanticOutputParser(pydantic_object=CompleteSummary)
    format_instructions = output_parser.get_format_instructions()
    print("Format instructions: " + format_instructions)

    # The topic list is user-editable, so it is supplied as a template
    # variable instead of being concatenated into the template source: any
    # "{" or "}" typed by the user would otherwise be parsed as a placeholder
    # and break (or hijack) the prompt. The rendered prompt is unchanged.
    onepass_summary_template = (
        summarize_template + "{summary_sections}" + context_str + "{format_instructions}"
    )
    prompt = PromptTemplate.from_template(onepass_summary_template)

    # prompt -> model -> structured-output parser
    summary_chain = prompt | model | output_parser
    print("Getting Summary......")
    return summary_chain.invoke({
        "summary_sections": summary_sections,
        "context_content": pages,
        "format_instructions": format_instructions,
    })
# Function to generate the configuration form for the web interface
def getConfigForm():
    """Build the configuration card shown on the home page.

    The form posts to ``/submit`` via htmx, shows ``#indicator`` while the
    request is running and places the response inside ``#result``.

    Returns:
        A ``Card`` wrapping the form with model, key, file and instruction
        inputs.
    """
    # hx_swap (not hx_swap_oob) is the attribute that controls how the
    # response replaces the target; hx-swap-oob is only meaningful on elements
    # inside a response, so it had no effect on the form itself.
    form = Form(hx_post="/submit", hx_target="#result", hx_swap="innerHTML", hx_indicator="#indicator")
    return Card(form(
        Div(
            Label(Strong("Model and Prompt Instruction: "), style="color:#3498db; font-size:25px;")
        ),
        Div(
            Label(Strong('Model: ')),
            Select(Option("OpenAI"), Option("Anthropic"), id="model")
        ),
        Div(
            Label(Strong('Secret Key: ')),
            Input(id="secret", type="password", placeholder="Key: "),
        ),
        Div(
            Label(Strong('Upload File: '), "Upload only pdf file with max size of 1 MB"),
            # NOTE(review): `max` is not a size limit for file inputs in HTML,
            # so the 1 MB cap is advisory only unless enforced server-side.
            Input(id="file", type='file', placeholder="Key: ", accept=".pdf", max='1024000'),
        ),
        Div(
            Label(Strong('Instruction: ')),
            P('Provide the list of topics and their one line description for summarization as shown in example. Summarization will have these sections.',
              style='font-size: 12px;'),
            # Pre-filled with the default topic list so users can edit it.
            Textarea(summary_sections, id="instruction",
                     style="height:250px")
        ),
        Div(
            Button("Summarize")
        ),
        Div(
            Br(),
            A("Developed by Manaranjan Pradhan", href="http://www.manaranjanp.com/",
              target="_blank",
              style='color: red; font-size: 16px;')
        )))
# Define the route for the homepage
@app.get('/')
def homepage():
    """Serve the landing page: the config form beside the progress/result area."""
    form_card = getConfigForm()

    # Shown by htmx only while a /submit request is in flight.
    busy_indicator = Div(
        Label(Strong('Summarizing the document.... take a deep breath....')),
        Progress(),
        id="indicator",
        cls="htmx-indicator",
    )

    # Target container that receives the rendered summary table.
    # NOTE(review): "font-size=24pt" is not valid CSS (":" expected); it is
    # kept unchanged so the rendered page stays the same.
    result_area = Div(id="result", style="font-family:Helvetica; font-size=24pt;")

    page_grid = Grid(
        form_card,
        Div(busy_indicator, result_area),
        style="grid-template-columns: 400px 1000px; gap: 50px;",
    )
    return Titled('Document Summarization', page_grid)
# Define the route for form submission
@app.post('/submit')
async def post(d:dict):
    """Handle the form submission: read the PDF, summarize it, render a table.

    Args:
        d: Submitted form fields; expects ``file`` (the uploaded PDF),
            ``model``, ``secret`` and ``instruction``.

    Returns:
        The summary table on success, an alert ``Div`` when no file was
        supplied, or the error message as a string when processing fails.
    """
    try:
        upload = d.get('file')
        # NOTE(review): an empty file field may arrive as something other than
        # an upload object (e.g. an empty string) - treated as "no file".
        if upload is None or not hasattr(upload, 'read'):
            return Div("File not uploaded.", cls='alert')

        pages = await upload.read(-1)
        if not pages:
            return Div("File not uploaded.", cls='alert')
        pdf_reader = PdfReader(io.BytesIO(pages))

        # Extract text page by page; guard against pages that yield no text.
        text_content = "".join(
            (page.extract_text() or "") + "\n" for page in pdf_reader.pages
        )

        # Get the appropriate language model
        model = getModel(d['model'], d['secret'])
        # Perform one-pass summarization
        summaries = onepass_summarize(text_content, d['instruction'], model)
        print(f"Summary Obtained: {summaries}")
        # Generate and return the HTML table with the summaries
        return generate_table(summaries)
    except Exception as e:
        # Exception rather than BaseException, so task cancellation and
        # interpreter shutdown signals are not swallowed and turned into a
        # normal response.
        print(traceback.format_exc())
        return str(e)
setup_hf_backup(app)
# Start the FastHTML application server
serve()
|