added full code
app.py
CHANGED
@@ -8,9 +8,22 @@ import tempfile
 import base64
 import dotenv
 from dotenv import load_dotenv
+import torch
+from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
+from torch.nn.functional import softmax
+from doctr.models import ocr_predictor
+from doctr.io import DocumentFile
 
 load_dotenv()
 
+model = DistilBertForSequenceClassification.from_pretrained('./fine_tuned_distilbert')
+tokenizer = DistilBertTokenizer.from_pretrained('./fine_tuned_distilbert')
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+mapping = {"Remembering": 0, "Understanding": 1, "Applying": 2, "Analyzing": 3, "Evaluating": 4, "Creating": 5}
+reverse_mapping = {v: k for k, v in mapping.items()}
+modelocr = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
+
 # Previous functions from Question Generator
 def get_pdf_path(pdf_source=None, uploaded_file=None):
     try:
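For reference, the new module-level setup loads a fine-tuned DistilBERT classifier whose six labels follow Bloom's taxonomy (Remembering through Creating). A minimal sketch, outside the diff, of exercising that classifier directly; it assumes the ./fine_tuned_distilbert directory exists and was trained with the same label order as the mapping above:

# Sanity-check the fine-tuned classifier (sketch; assumes ./fine_tuned_distilbert).
import torch
from torch.nn.functional import softmax
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

model = DistilBertForSequenceClassification.from_pretrained('./fine_tuned_distilbert')
tokenizer = DistilBertTokenizer.from_pretrained('./fine_tuned_distilbert')
labels = ["Remembering", "Understanding", "Applying", "Analyzing", "Evaluating", "Creating"]

model.eval()
inputs = tokenizer("What are the key findings of the paper?",
                   return_tensors='pt', padding=True, truncation=True, max_length=512)
with torch.no_grad():
    probs = softmax(model(**inputs).logits, dim=-1).squeeze()
print({label: round(float(p), 3) for label, p in zip(labels, probs)})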
@@ -183,63 +196,23 @@ def process_pdf_and_generate_questions(pdf_source, uploaded_file, api_key, role_
         st.error(f"Error processing PDF and generating questions: {e}")
         return []
 
-dummydata = [
-    {"question": "What is the main idea of the paper?", "score": {
-        "Knowledge": 10,
-        "Comprehension": 9,
-        "Application": 8,
-        "Analysis": 7,
-        "Synthesis": 6,
-        "Evaluation": 5
-    }},
-    {"question": "What are the key findings of the paper?", "score": {
-        "Knowledge": 9,
-        "Comprehension": 8,
-        "Application": 7,
-        "Analysis": 6,
-        "Synthesis": 5,
-        "Evaluation": 4
-    }},
-    {"question": "How does the paper contribute to the field?", "score": {
-        "Knowledge": 8,
-        "Comprehension": 7,
-        "Application": 6,
-        "Analysis": 5,
-        "Synthesis": 4,
-        "Evaluation": 3
-    }},
-    {"question": "What are the limitations of the paper?", "score": {
-        "Knowledge": 7,
-        "Comprehension": 6,
-        "Application": 5,
-        "Analysis": 4,
-        "Synthesis": 3,
-        "Evaluation": 2
-    }},
-    {"question": "What are the future research directions?", "score": {
-        "Knowledge": 6,
-        "Comprehension": 5,
-        "Application": 4,
-        "Analysis": 3,
-        "Synthesis": 2,
-        "Evaluation": 1
-    }},
-    {"question": "How does the paper compare to existing work?", "score": {
-        "Knowledge": 5,
-        "Comprehension": 4,
-        "Application": 3,
-        "Analysis": 2,
-        "Synthesis": 1,
-        "Evaluation": 0
-        }
-    }
-
-]
-
 def main():
     st.set_page_config(page_title="Academic Paper Tool", page_icon="π", layout="wide")
-
     # Tabs for different functionalities
+    st.markdown("""
+        <style>
+        .stTabs [data-baseweb="tab"] {
+            margin-bottom: 1rem;
+            flex: 1;
+            justify-content: center;
+        }
+        .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
+            font-size:2rem;
+            padding: 0 2rem;
+            margin: 0;
+        }
+        </style>
+    """, unsafe_allow_html=True)
     tab1, tab2 = st.tabs(["Question Generator", "Paper Scorer"])
 
     if 'totalscore' not in st.session_state:
@@ -275,6 +248,7 @@ def main():
 
         st.session_state.pdf_url = st.text_input(
             "Enter the URL of the PDF",
+            value=st.session_state.pdf_url,
            key="pdf_url_input"
        )
 
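The added value=st.session_state.pdf_url pre-seeds the field from session state on each rerun. A small sketch of the same pattern in a hypothetical standalone app:

# Pre-seeded text input (sketch; standalone example, not from the diff).
import streamlit as st

if 'pdf_url' not in st.session_state:
    st.session_state.pdf_url = ""

# The widget redisplays the remembered value on every rerun; assigning the
# return value keeps st.session_state.pdf_url in sync with user edits.
st.session_state.pdf_url = st.text_input(
    "Enter the URL of the PDF",
    value=st.session_state.pdf_url,
    key="pdf_url_input"
)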
@@ -445,7 +419,6 @@ def main():
                 type=['pdf','jpg','png','jpeg'],
                 label_visibility="collapsed"
             )
-
             # Custom submit button with some styling
             submit_button = st.form_submit_button(
                 "Score Paper",
@@ -455,17 +428,33 @@
 
             if submit_button:
                 # Calculate total score
-
-
-
-
-
+                print(uploaded_file.name)
+                dummydata = sendtogemini(uploaded_file.name)
+                print(dummydata)
+                total_score = {'Remembering': 0, 'Understanding': 0, 'Applying': 0, 'Analyzing': 0, 'Evaluating': 0, 'Creating': 0}
+                for item in dummydata:
+                    for category in total_score:
+                        total_score[category] += item['score'][category]
+
+                # average_score = total_score / (len(dummydata) * 6 * 10) * 100
 
                 # Score display columns
-
-
-
-
+                categories = ['Remembering', 'Understanding', 'Applying', 'Analyzing', 'Evaluating', 'Creating']
+
+                # Create 6 columns in a single row
+                cols = st.columns(6)
+
+                # Iterate through categories and populate columns
+                for i, category in enumerate(categories):
+                    with cols[i]:
+                        score = round(total_score[category] / len(dummydata), ndigits=3)
+                        color = 'green' if score > .7 else 'orange' if score > .4 else 'red'
+                        st.markdown(f"""
+                            <div class="score-breakdown">
+                                <div class="score-header" style="color: {color}">{category}</div>
+                                <div style="font-size: 24px; color: {color};">{score}/1</div>
+                            </div>
+                        """, unsafe_allow_html=True)
 
                 with st.expander("Show Detailed Scores", expanded=True):
                     for idx, item in enumerate(dummydata, 1):
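Since predict_with_loaded_model (added further down) returns softmax probabilities, each per-category mean above stays in [0, 1]; that is why the color thresholds drop from 7/4 to .7/.4 and the display reads {score}/1. A small sketch of the aggregation with made-up probabilities:

# Per-category averaging over question scores (sketch; data is illustrative).
data = [
    {"score": {"Remembering": 0.62, "Understanding": 0.21, "Applying": 0.08,
               "Analyzing": 0.05, "Evaluating": 0.03, "Creating": 0.01}},
    {"score": {"Remembering": 0.10, "Understanding": 0.55, "Applying": 0.20,
               "Analyzing": 0.09, "Evaluating": 0.04, "Creating": 0.02}},
]
totals = {category: 0.0 for category in data[0]["score"]}
for item in data:
    for category in totals:
        totals[category] += item["score"][category]
averages = {category: round(total / len(data), 3) for category, total in totals.items()}
print(averages)  # {'Remembering': 0.36, 'Understanding': 0.38, 'Applying': 0.14, ...}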
@@ -477,13 +466,13 @@ def main():
                         score_cols = st.columns(6)
 
                         # Scoring categories
-                        categories = ['Knowledge', 'Comprehension', 'Application', 'Analysis', 'Synthesis', 'Evaluation']
+                        categories = ['Remembering', 'Understanding', 'Applying', 'Analyzing', 'Evaluating', 'Creating']
 
                         for col, category in zip(score_cols, categories):
                             with col:
                                 # Determine color based on score
-                                score = item['score'][category]
-                                color = 'green' if score > 7 else 'orange' if score > 4 else 'red'
+                                score = round(item['score'][category], ndigits=3)
+                                color = 'green' if score > .7 else 'orange' if score > .4 else 'red'
 
                                 st.markdown(f"""
                                     <div style="text-align: center;
@@ -492,7 +481,7 @@ def main():
                                         padding: 5px;
                                         margin-bottom: 5px;">
                                         <div style="font-weight: bold; color: {color};">{category}</div>
-                                        <div style="font-size: 18px; color: {color};">{score}/10</div>
+                                        <div style="font-size: 18px; color: {color};">{score}/1</div>
                                         </div>
                                 """, unsafe_allow_html=True)
 
@@ -501,12 +490,83 @@ def main():
                     # Add a separator between questions
                     if idx < len(dummydata):
                         st.markdown('---')
-
-
-
-
-
-
+
+def predict_with_loaded_model(text):
+    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
+    input_ids = inputs['input_ids'].to(device)
+    model.eval()
+    with torch.no_grad():
+        outputs = model(input_ids)
+    logits = outputs.logits
+    probabilities = softmax(logits, dim=-1)
+    probabilities = probabilities.squeeze().cpu().numpy()
+    # Convert to float and format to 3 decimal places
+    class_probabilities = {reverse_mapping[i]: float(f"{prob:.3f}") for i, prob in enumerate(probabilities)}
+    return class_probabilities
+
+# def process_document(input_path):
+#     return {'Avg_Confidence': 0.9397169561947093, 'String': ['What are the key differences between classification and regression tasks in', 'supervised learning, and how do you determine which algorithm to use for a', 'specific problem?', 'e How does clustering differ from dimensionality reduction, and can you', 'provide real-world examples of where each is applied?', 'What are common evaluation metrics for classification models, and how do', 'precision, recall, and F1-score relate to each other?', 'How do convolutional neural networks (CNNS) and recurrent neural networks', '(RNNS) differ in their architecture and applications?', 'What steps can be taken to identify and mitigate bias in machine learning', 'models, and why is this an important consideration?']}
+
+def process_document(input_path):
+    if input_path.lower().endswith(".pdf"):
+        doc = DocumentFile.from_pdf(input_path)
+        # print(f"Number of pages: {len(doc)}")
+    elif input_path.lower().endswith((".jpg", ".jpeg", ".png", ".bmp", ".tiff")):
+        doc = DocumentFile.from_images(input_path)
+    else:
+        raise ValueError("Unsupported file type. Please provide a PDF or an image file.")
+    result = modelocr(doc)
+    def calculate_average_confidence(result):
+        total_confidence = 0
+        word_count = 0
+        for page in result.pages:
+            for block in page.blocks:
+                for line in block.lines:
+                    for word in line.words:
+                        total_confidence += word.confidence
+                        word_count += 1
+        average_confidence = total_confidence / word_count if word_count > 0 else 0
+        return average_confidence
+    average_confidence = calculate_average_confidence(result)
+    string_result = result.render()
+    return {'Avg_Confidence': average_confidence, 'String': string_result.split('\n')}
+
+def sendtogemini(inputpath):
+    qw = process_document(inputpath)
+    questionset = str(qw['String'])
+    # send this prompt to gemini:
+    questionset += """You are given a list of text fragments containing question fragments extracted by an OCR model. Your task is to:
+    # only Merge the question fragments into complete and coherent questions. Don't answer them.
+    # Separate each question, start a new question with @ to make them easily distinguishable for further processing."""
+    url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent?key={os.getenv('GEMINI_API_KEY')}"
+
+    payload = {
+        "contents": [
+            {
+                "parts": [
+                    {"text": questionset}
+                ]
+            }
+        ]
+    }
+    headers = {"Content-Type": "application/json"}
+
+    response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=60)
+    result = response.json()
+    res1 = result.get("candidates", [{}])[0].get("content", {}).get("parts", [{}])[0].get("text", "")
+    question = []
+    for i in res1.split('\n'):
+        i = i.strip()
+        if len(i) > 0:
+            if i[0] == '@':
+                question.append(i[1:].strip())
+    data = []
+    for i in question:
+        d = {}
+        d['question'] = i
+        d['score'] = predict_with_loaded_model(i)
+        data.append(d)
+    return data
 
 # Run Streamlit app
 if __name__ == "__main__":
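The OCR half of the new pipeline can be tried in isolation. A sketch mirroring process_document above; 'sample.pdf' is a hypothetical input, and the detector/recognizer pair matches the diff:

# Average word-level OCR confidence with docTR (sketch; 'sample.pdf' is hypothetical).
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

predictor = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
result = predictor(DocumentFile.from_pdf('sample.pdf'))

# Flatten the page/block/line hierarchy to word level, as the diff does.
words = [word for page in result.pages for block in page.blocks
         for line in block.lines for word in line.words]
avg_confidence = sum(word.confidence for word in words) / len(words) if words else 0.0
print(f"Avg_Confidence: {avg_confidence:.3f}")
print(result.render().split('\n')[:5])  # first few recognized lines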