import os
from rank_bm25 import BM25Okapi
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
import torch
import gradio as gr
from docx import Document
import pdfplumber

# Load the RoBERTa QA model (fine-tuned on SQuAD 2.0) and its tokenizer
model_name = "IProject-10/roberta-base-finetuned-squad2"  # Replace with your model name
qa_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Select the device (GPU if available), move the model to it, and set eval mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
qa_model.to(device)
qa_model.eval()

# Create a question-answering pipeline (applied to the BM25-retrieved passages)
retrieval_qa_pipeline = pipeline(
    "question-answering",
    model=qa_model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)
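
# Illustrative only (not executed here): a typical Hugging Face question-answering
# pipeline call and the dict it returns; the score value below is hypothetical.
#   retrieval_qa_pipeline(question="Who wrote it?", context="It was written by Jane Doe.")
#   -> {"score": 0.97, "start": 18, "end": 26, "answer": "Jane Doe"}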

def extract_text_from_file(file):
    # Determine the file extension
    file_extension = os.path.splitext(file.name)[1].lower()
    text = ""

    try:
        if file_extension == ".txt":
            with open(file.name, "r", encoding="utf-8") as f:
                text = f.read()
        elif file_extension == ".docx":
            doc = Document(file.name)
            for para in doc.paragraphs:
                text += para.text + "\n"
        elif file_extension == ".pdf":
            with pdfplumber.open(file.name) as pdf:
                for page in pdf.pages:
                    # extract_text() may return None for image-only pages
                    text += (page.extract_text() or "") + "\n"
        else:
            raise ValueError("Unsupported file format: {}".format(file_extension))
    except Exception as e:
        text = str(e)
    return text
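
# Illustrative usage (assumption: Gradio passes file objects exposing a .name path):
#   extract_text_from_file(uploaded_file)  # -> full text of a .txt, .docx, or .pdf file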

def load_passages(files):
    passages = []
    for file in files:
        passage = extract_text_from_file(file)
        passages.append(passage)
    return passages

def highlight_answer(context, answer):
    start_index = context.find(answer)
    if start_index != -1:
        end_index = start_index + len(answer)
        highlighted_context = f"{context[:start_index]}_________<<{context[start_index:end_index]}>>_________{context[end_index:]}"
        return highlighted_context
    else:
        return context
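
# Illustrative example of the highlighting format produced by highlight_answer:
#   highlight_answer("Paris is the capital of France.", "Paris")
#   -> "_________<<Paris>>_________ is the capital of France."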

def answer_question(question, files):
    try:
        # Load passages from the uploaded files
        passages = load_passages(files)

        # Create an index using BM25
        bm25 = BM25Okapi([passage.split() for passage in passages])

        # Retrieve relevant passages using BM25
        tokenized_query = question.split()
        candidate_passages = bm25.get_top_n(tokenized_query, passages, n=3)
        bm25_scores = bm25.get_scores(tokenized_query)
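
        # For reference, a sketch of how BM25 scores each passage d against query q
        # (standard Okapi BM25; rank_bm25 defaults are assumed: k1=1.5, b=0.75):
        #   score(q, d) = sum over terms t in q of
        #     IDF(t) * f(t, d) * (k1 + 1) / (f(t, d) + k1 * (1 - b + b * |d| / avgdl))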

        # Extract answer using the pipeline for each candidate passage
        answers_with_context = []
        for passage in candidate_passages:
            answer = retrieval_qa_pipeline(question=question, context=passage)
            bm25_score = bm25_scores[passages.index(passage)]
            answer_with_context = {
                "context": passage,
                "answer": answer["answer"],
                "BM25-score": bm25_score  # BM25 confidence score for this passage
            }
            answers_with_context.append(answer_with_context)

        # Choose the answer from the passage with the highest BM25 score
        best_answer = max(answers_with_context, key=lambda x: x["BM25-score"])

        # Highlight the answer in the context
        highlighted_context = highlight_answer(best_answer["context"], best_answer["answer"])

        return best_answer["answer"], highlighted_context, best_answer["BM25-score"]
    except Exception as e:
        return str(e), "", ""
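
# Illustrative call (hypothetical `uploaded_files` list, as provided by the gr.Files input below):
#   answer, context, score = answer_question("What is BM25?", uploaded_files)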

# Description
md = """
### Brief Overview of the project:

A Document-Retrieval QA application built by fine-tuning the **[RoBERTa model](https://arxiv.org/pdf/1907.11692)** on the **[SQuAD 2.0](https://rajpurkar.github.io/SQuAD-explorer/)** dataset for efficient answer extraction,
augmented with a **[BM25](https://www.researchgate.net/publication/220613776_The_Probabilistic_Relevance_Framework_BM25_and_Beyond)** retriever for information retrieval from a large text corpus.
The project extends the work done in the research paper **Encoder-based LLMs: Building QA systems and Comparative Analysis**,
in which we study BERT and its advanced variants and build an efficient answer-extraction QA system from scratch.
The resulting system can be used in information retrieval systems and search engines.

**Objectives of the project:**
1. Build a simple Answer Extraction QA system using **RoBERTa-base** (this objective is deployed separately at a public URL, objective1).
2. Build an Information Retrieval system for data augmentation using **BM25**.
3. Build a **Document Retrieval QA** system by combining the Answer Extraction QA system with the Information Retrieval system.

### Demonstrating the working of the Application:

<div style="text-align: center;">
    <img src="https://i.imgur.com/oYg8y7N.jpeg" alt="Description Image" style="border: 2px solid #000; border-radius: 5px; width: 600px; height: auto; display: block; margin: 0 auto;">
</div>

**Key Features:**
- Fine-tuned **RoBERTa** - performs **Answer Extraction** from the retrieved document
- **BM25** retriever - performs **Information Retrieval** from the text corpus
- Provides answers with **highlighted context**
- Displays the extracted **answer**, the most relevant document **context**, and the corresponding **BM25 score** of the passage

**How to Use:**
1. Upload your corpus document(s).
2. Enter your question in the text box, followed by a question mark (?).
3. Get the answer with its context and the corresponding BM25 score.
"""

# Define Gradio interface
iface = gr.Interface(
    fn=answer_question,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your question here...", label="Question"),
        gr.Files(label="Upload text, Word, or PDF files")
    ],
    outputs=[
        gr.Textbox(label="Answer"),
        gr.Textbox(label="Context"),
        gr.Textbox(label="BM25 Score")
    ],
    title="Document Retrieval Question Answering Application",
    description=md,
    css="""
    .container { max-width: 800px; margin: auto; }
    .interface-title { font-family: Arial, sans-serif; font-size: 24px; font-weight: bold; }
    .interface-description { font-family: Arial, sans-serif; font-size: 16px; margin-bottom: 20px; }
    .input-textbox, .output-textbox { font-family: Arial, sans-serif; font-size: 14px; }
    .error { color: red; font-family: Arial, sans-serif; font-size: 14px; }
    """
)

# Launch the interface
iface.launch()
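
# Note: launch() also accepts share=True to expose a temporary public link when
# running locally (standard Gradio option).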