|
import json
import os
import tempfile
from typing import List

from fastapi import FastAPI, UploadFile, File, HTTPException
from langchain.document_loaders import PyPDFLoader
from langchain_groq import ChatGroq
from pydantic import BaseModel
|
|
|
|
|
# The Groq credential is read from the environment once, at import time.
# A missing or empty value aborts start-up instead of failing on first request.
API_KEY = os.environ.get("GROQ_API_KEY")
if not API_KEY:
    raise ValueError("GROQ_API_KEY environment variable not set.")


# Application object that the route decorator below registers against.
app = FastAPI(
    title="PDF Question Extractor",
    version="1.0",
)
|
|
|
|
|
class ExtractionResult(BaseModel):
    # Shape of the JSON object the LLM is asked to return; its JSON schema is
    # embedded verbatim in the prompt and the reply is validated against it.
    #
    # NOTE: documented with '#' comments on purpose. Pydantic copies a model
    # docstring into the schema's "description", which would change the prompt.

    # Extracted answers, one string per entry.
    answers: List[str]
|
|
|
|
|
def get_llm():
    """Create the Groq chat-model client used by this module.

    Returns:
        A ``ChatGroq`` instance configured for the ``llama-3.3-70b-versatile``
        model with temperature 0 and a 1024-token completion cap,
        authenticated with the module-level ``API_KEY``.
    """
    settings = {
        "model": "llama-3.3-70b-versatile",
        "temperature": 0,
        "max_tokens": 1024,
        "api_key": API_KEY,
    }
    return ChatGroq(**settings)


# Single client instance, built once when the module is imported.
llm = get_llm()
|
|
|
@app.post("/extract-answers/")
async def extract_answers(file: UploadFile = File(...)):
    """Extract answers/options from an uploaded PDF via the Groq LLM.

    The upload is written to a private temporary file (``PyPDFLoader`` takes a
    filesystem path), the text of all pages is joined and sent to the model
    together with the JSON schema of ``ExtractionResult``, and the model's
    JSON reply is validated against that schema.

    Args:
        file: The uploaded PDF document.

    Returns:
        The validated result as a plain dict, i.e. ``{"answers": [...]}``.

    Raises:
        HTTPException: 400 if the upload is empty; 500 (with the underlying
            error message as ``detail``) if loading the PDF, calling the
            model, or validating its reply fails.
    """
    temp_path = None
    try:
        # UploadFile.read() is the async counterpart of file.file.read() and
        # does not block the event loop while the upload is consumed.
        payload = await file.read()
        if not payload:
            raise HTTPException(status_code=400, detail="Uploaded file is empty.")

        # Never derive the path from the client-supplied filename: it may
        # contain path separators, and two concurrent uploads with the same
        # name would overwrite each other. Let tempfile choose a unique path.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as buffer:
            temp_path = buffer.name
            buffer.write(payload)

        # load() yields one document per page. load_and_split() would also run
        # a text splitter whose chunks can overlap, so re-joining them could
        # duplicate text in the prompt.
        pages = PyPDFLoader(temp_path).load()
        all_page_content = "\n".join(page.page_content for page in pages)

        schema = json.dumps(ExtractionResult.model_json_schema(), indent=2)

        system_message = (
            "You are a document analysis tool that extracts the options and correct answers from the provided document content. "
            "The output must be a JSON object that strictly follows the schema: " + schema
        )
        user_message = (
            "Please extract the correct answers and options (A, B, C, D, E) from the following document content:\n\n"
            + all_page_content
        )
        prompt = system_message + "\n\n" + user_message

        # NOTE(review): invoke() is a synchronous call inside an async handler;
        # consider the client's async API if request concurrency matters.
        response = llm.invoke(prompt, response_format={"type": "json_object"})

        result = ExtractionResult.model_validate_json(response.content)
        return result.model_dump()

    except HTTPException:
        # Deliberate HTTP errors (e.g. the 400 above) must not become 500s.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Remove the temp file on success and on failure alike.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                # Best-effort cleanup: a failed delete must not mask the
                # response or the original error.
                pass
|
|