# Hugging Face Space: Hammad712/grading (Space status when captured: Sleeping)
# File size: 2,763 bytes
# Revisions shown in the per-line blame gutter: 7f269b9, 16aaeed

import json
import os
import tempfile
from typing import List

from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.concurrency import run_in_threadpool
from langchain.document_loaders import PyPDFLoader
from langchain_groq import ChatGroq
from pydantic import BaseModel

# The Groq credential is read from the process environment, never hard-coded.
API_KEY = os.environ.get("GROQ_API_KEY")
if API_KEY is None or API_KEY == "":
    # Abort at import time so a missing (or empty) key is noticed immediately.
    raise ValueError("GROQ_API_KEY environment variable not set.")

# FastAPI application instance; the @app.get / @app.post decorators in this
# file attach their handlers to it. title and version are passed as metadata.
app = FastAPI(title="PDF Question Extractor", version="1.0")

# Expected shape of the JSON object returned by the LLM.
# The extraction endpoint embeds this model's JSON schema in the prompt and
# validates the LLM reply against it.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose. Pydantic is understood to copy a model docstring into the generated
# schema's "description", which would change the prompt text; confirm before
# adding one.
class ExtractionResult(BaseModel):
    # Extracted answers as a list of strings.
    answers: List[str]

# Factory for the language model (LLM) client
def get_llm():
    """Build the Groq chat client.

    Returns:
        A ``ChatGroq`` instance for the ``llama-3.3-70b-versatile`` model,
        configured with temperature 0 and a 1024-token cap, and authenticated
        with the module-level ``API_KEY``.
    """
    settings = {
        "model": "llama-3.3-70b-versatile",
        "temperature": 0,
        "max_tokens": 1024,
        "api_key": API_KEY,
    }
    return ChatGroq(**settings)

# Module-level client, created once at import time and used by the request
# handler in this file.
llm = get_llm()

# Root endpoint: returns a welcome message and a usage hint
@app.get("/")
async def root():
    """Return a static payload describing the API and how to call it."""
    welcome = "Welcome to the PDF Question Extractor API."
    how_to = "POST your PDF to /extract-answers/ to extract answers."
    return dict(message=welcome, usage=how_to)

# Builds the single prompt string sent to the LLM for one document
def _build_prompt(document_text: str) -> str:
    """Return the combined system + user prompt for *document_text*.

    The JSON schema generated from ``ExtractionResult`` is embedded in the
    system part so the model is told the exact shape of the object to return.
    """
    schema = json.dumps(ExtractionResult.model_json_schema(), indent=2)
    system_message = (
        "You are a document analysis tool that extracts the options and correct answers "
        "from the provided document content. The output must be a JSON object that strictly follows the schema: "
        + schema
    )
    user_message = (
        "Please extract the correct answers and options (A, B, C, D, E) from the following document content:\n\n"
        + document_text
    )
    return system_message + "\n\n" + user_message


# Blocking part of the pipeline: parse the PDF, query the LLM, validate the reply
def _extract_from_pdf(pdf_path: str) -> ExtractionResult:
    """Extract answers from the PDF stored at *pdf_path*.

    This function is synchronous (PDF parsing and ``llm.invoke`` both block),
    so the endpoint runs it in a worker thread.

    Raises:
        HTTPException: 400 when no text could be extracted from the PDF.
        Exception: anything raised by the loader, the LLM call, or the
            Pydantic validation of the LLM reply.
    """
    pages = PyPDFLoader(pdf_path).load_and_split()
    document_text = "\n".join(page.page_content for page in pages)

    # Without any text there is nothing to extract; do not spend an LLM call
    # on an empty document.
    if not document_text.strip():
        raise HTTPException(
            status_code=400,
            detail="No extractable text found in the uploaded PDF.",
        )

    # Invoke the LLM and request a JSON response
    response = llm.invoke(
        _build_prompt(document_text),
        response_format={"type": "json_object"},
    )

    # Validate and parse the JSON response using Pydantic
    return ExtractionResult.model_validate_json(response.content)


# PDF extraction endpoint: Processes a PDF file upload
@app.post("/extract-answers/")
async def extract_answers(file: UploadFile = File(...)):
    """Extract answers from an uploaded PDF using the LLM.

    Args:
        file: The uploaded PDF (multipart form field ``file``).

    Returns:
        A dict produced by ``ExtractionResult.model_dump()``, i.e.
        ``{"answers": [...]}``.

    Raises:
        HTTPException: 400 when the PDF contains no extractable text; 500 with
            the error text as ``detail`` for any other failure.
    """
    temp_path = None
    try:
        # Use a server-generated unique name. The client-supplied filename is
        # untrusted (it may contain path separators) and identical names from
        # concurrent uploads would overwrite each other.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as buffer:
            temp_path = buffer.name
            buffer.write(await file.read())

        # PDF parsing and the LLM call are blocking; keep them off the event
        # loop so other requests are still served meanwhile.
        result = await run_in_threadpool(_extract_from_pdf, temp_path)
        return result.model_dump()

    except HTTPException:
        # Deliberate HTTP errors keep their own status code.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Always remove the temporary file, on failure as well as on success.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except FileNotFoundError:
                pass