File size: 3,122 Bytes
e07e824
6687c9a
 
 
 
 
 
 
870d31e
 
 
 
e07e824
6687c9a
 
 
 
 
e07e824
6687c9a
 
 
 
 
 
 
 
 
 
 
 
 
417c147
6687c9a
417c147
 
 
 
 
 
6687c9a
 
 
 
 
ba7dd0b
 
6687c9a
ba7dd0b
 
 
 
6687c9a
 
 
ba7dd0b
6687c9a
 
 
f227fd1
 
 
6687c9a
 
 
 
 
f227fd1
 
 
 
 
6687c9a
 
 
 
 
 
06dcc43
6687c9a
 
 
 
 
 
f227fd1
6687c9a
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import gradio as gr
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle
import spacy
from tqdm import tqdm
import gc
import os

# Download the SpaCy model
# NOTE(review): shelling out at import time re-downloads the model on every
# app start and assumes network access + a writable environment — consider
# bundling the model or guarding with a cache check.
os.system("python -m spacy download en_core_web_lg")

# Load models
# Four pre-trained Keras models; their outputs are blended with fixed
# weights in classify_question below.
model_1 = tf.keras.models.load_model("model_1.h5")
model_2 = tf.keras.models.load_model("model_2.h5")
model_3 = tf.keras.models.load_model("model_3.h5")
model_4 = tf.keras.models.load_model("model_4.h5")

# Load dictionaries
# word_dict: token text -> integer index, the vocabulary used at training time.
with open('word_dict.pkl', 'rb') as f:
    word_dict = pickle.load(f)

# lemma_dict is loaded but not referenced anywhere in this file.
with open('lemma_dict.pkl', 'rb') as f:
    lemma_dict = pickle.load(f)

# Load SpaCy NLP model
# Only tokenization is needed; parser/ner/tagger are disabled for speed.
# NOTE: with the tagger disabled, token.pos_ is empty downstream.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner', 'tagger'])
# Register a custom IS_STOP flag backed by spaCy's English stop-word list.
# NOTE(review): Vocab.add_flag was removed in spaCy v3 — this line assumes
# spaCy v2.x; verify the pinned spacy version.
nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)

def preprocess_text(text):
    """Tokenize *text* with SpaCy and map non-punctuation tokens to indices.

    Tokens not yet in the global ``word_dict`` are assigned the next free
    index on the fly (this mutates the module-level ``word_dict``), matching
    the indexing scheme used when the models were trained.

    Args:
        text: Raw question string.

    Returns:
        list[int]: Word indices in token order, punctuation excluded.
    """
    word_seq = []
    for doc in nlp.pipe([text], n_process=1):
        for token in doc:
            # BUG FIX: the pipeline is loaded with the tagger disabled, so
            # token.pos_ is always "" and the original `pos_ != "PUNCT"`
            # check filtered nothing. `is_punct` is a rule-based lexeme
            # attribute that works without the tagger.
            if not token.is_punct:
                if token.text not in word_dict:
                    # New word: next free index (index 0 is reserved for
                    # padding, hence len(word_dict) + 1).
                    word_dict[token.text] = len(word_dict) + 1
                word_seq.append(word_dict[token.text])
    return word_seq

def classify_question(text):
    """Classify a question as "Sincere" or "Insincere" with a model ensemble.

    The four pre-loaded Keras models are blended with fixed weights
    (0.15, 0.35, 0.15, 0.35) and the blended score is thresholded at 0.35.

    Args:
        text: The question to classify.

    Returns:
        tuple[str, dict]: The label and a JSON-serializable dict containing
        the blended probability, the per-model weighted contributions, and
        the preprocessed index sequence.
    """
    word_indices = preprocess_text(text)
    # NOTE(review): maxlen=55 must match the sequence length used at
    # training time — confirm against the training pipeline.
    padded = tf.keras.preprocessing.sequence.pad_sequences([word_indices], maxlen=55)
    BATCH_SIZE = 512

    # Run each model and apply its ensemble weight.
    contributions = []
    for weight, model in zip((0.15, 0.35, 0.15, 0.35),
                             (model_1, model_2, model_3, model_4)):
        raw = model.predict(padded, batch_size=BATCH_SIZE, verbose=2)
        contributions.append(weight * np.squeeze(raw))

    ensemble_score = contributions[0] + contributions[1] + contributions[2] + contributions[3]
    label = "Insincere" if ensemble_score > 0.35 else "Sincere"

    # Per-model contributions are the *weighted* values, not raw probabilities.
    probs = {
        "Probability": float(ensemble_score),
        "Model Probabilities": {"Model 1": float(contributions[0]), "Model 2": float(contributions[1]), "Model 3": float(contributions[2]), "Model 4": float(contributions[3]), "visible": False},
        "Sequence": {"value": word_indices, "visible": False}
    }

    return label, probs

# Example questions
# Sample inputs pre-filled in the Gradio UI (a mix of sincere and
# deliberately provocative questions to demo both labels).
examples = [
    "How do you train a pigeon to send messages?",
    "Is USA a shithole country owing to a shithole president?",
    "Why is Indian educationa total bullshit?",
    "Which person has given the least f**ks and still turned out successful?"
]

# Gradio Interface
# Wires classify_question to a single textbox input and two outputs:
# the text label and the JSON probability breakdown.
interface = gr.Interface(
    fn=classify_question,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your question here..."),
    ],
    outputs=[
        "text",  # Output for label
        "json"   # Output for probabilities
    ],
    title="Quora Insincere Questions Classifier",
    examples=examples,
    description="Enter your question to classify it as sincere or insincere. Select an example question from the dropdown."
)

# Starts the web server; blocks until the app is shut down.
interface.launch()