import streamlit as st
import torch
from transformers import GPT2Tokenizer, LlamaForSequenceClassification
import fitz  # PyMuPDF for extracting text from PDFs
import io
from torch.utils.data import Dataset, DataLoader

# Load the tokenizer and the fine-tuned classification model from the local "model" directory
model_path = "model"
tokenizer = GPT2Tokenizer.from_pretrained(model_path, local_files_only=True)
model = LlamaForSequenceClassification.from_pretrained(model_path, local_files_only=True)

# GPT-2 tokenizers ship without a pad token, and the padding='max_length' call
# below fails without one; fall back to the EOS token if none was saved.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

model.eval()

# Run on GPU when available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
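
# Note: Streamlit reruns this whole script on every widget interaction;
# wrapping the loading above in a function decorated with @st.cache_resource
# would keep the weights in memory across reruns instead of reloading them.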

# Extract all page text from an uploaded PDF
def extract_text_from_pdf(pdf_file):
    # Streamlit's UploadedFile is a file-like object; read its raw bytes
    pdf_bytes = pdf_file.read()

    # Wrap the bytes in a BytesIO stream so PyMuPDF can open them without a file path
    pdf_stream = io.BytesIO(pdf_bytes)

    # Open the in-memory PDF with PyMuPDF
    doc = fitz.open(stream=pdf_stream, filetype="pdf")

    text = ""
    for page in doc:
        text += page.get_text("text")

    doc.close()
    return text
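
# Note: page.get_text returns an empty string for scanned, image-only PDFs;
# supporting those would require an OCR pass over the page images, which this
# app does not attempt.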

# Tokenize a text pair, padding and truncating to a fixed 128-token length
def preprocess_text(text1, text2):
    inputs = tokenizer(
        text1, text2,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )
    return inputs
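
# For reference, the returned BatchEncoding holds fixed-shape tensors, e.g.:
#   inputs = preprocess_text("first text", "second text")
#   inputs['input_ids'].shape       # torch.Size([1, 128])
#   inputs['attention_mask'].shape  # torch.Size([1, 128])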

# Dataset wrapping paired texts so they can be batched by a DataLoader
class PlagiarismDataset(Dataset):
    def __init__(self, text1, text2, tokenizer):
        self.text1 = text1
        self.text2 = text2
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.text1)

    def __getitem__(self, idx):
        inputs = preprocess_text(self.text1[idx], self.text2[idx])
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0)
        }

# Classify each text pair with the model and return the label for the first pair
def detect_plagiarism(text1, text2):
    dataset = PlagiarismDataset(text1, text2, tokenizer)
    data_loader = DataLoader(dataset, batch_size=1, shuffle=False)

    predictions = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            predictions.append(preds.item())

    return predictions[0]
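
# Two optional helpers sketched below; neither is wired into the UI. Both
# assume the positive ("plagiarism") class is index 1, matching the result
# check at the bottom of this script.

# Sketch 1: expose a softmax probability instead of a hard 0/1 label, e.g. to
# show a confidence score alongside the verdict.
def plagiarism_probability(text1, text2):
    inputs = preprocess_text(text1, text2)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
    # Probability mass assigned to the assumed positive class
    return torch.softmax(logits, dim=1)[0, 1].item()

# Sketch 2: because preprocess_text truncates each pair to 128 tokens, only
# the opening of each PDF is actually compared. A simple (hypothetical)
# workaround is to split both documents into fixed-size word chunks and flag
# plagiarism if any aligned chunk pair is classified as positive; chunk_words
# is an illustrative parameter, not a tuned value.
def detect_plagiarism_chunked(text1, text2, chunk_words=80):
    def chunks(text):
        words = text.split()
        return [" ".join(words[i:i + chunk_words])
                for i in range(0, len(words), chunk_words)] or [""]
    for c1, c2 in zip(chunks(text1), chunks(text2)):
        if detect_plagiarism([c1], [c2]) == 1:
            return 1
    return 0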

# Streamlit UI
st.title("Plagiarism Detection with an LLM")
st.write("Upload two PDFs for plagiarism detection.")

# Upload PDFs
pdf_file1 = st.file_uploader("Upload the first PDF", type="pdf")
pdf_file2 = st.file_uploader("Upload the second PDF", type="pdf")

if pdf_file1 and pdf_file2:
    # Extract text from PDFs
    text1 = extract_text_from_pdf(pdf_file1)
    text2 = extract_text_from_pdf(pdf_file2)

    # Display some text from the PDFs for context
    st.subheader("Text from the first document:")
    st.text(text1[:1000])  # Display the first 1000 characters of the document
    st.subheader("Text from the second document:")
    st.text(text2[:1000])

    # Detect plagiarism
    result = detect_plagiarism([text1], [text2])

    # Display the result
    if result == 1:
        st.error("Plagiarism detected!")
    else:
        st.success("No plagiarism detected.")