AruniAnkur committed on
Commit 40f5f78 (verified) · 1 parent: 8c84d02

added full code

Files changed (1): app.py (+134 −74)
app.py CHANGED
@@ -8,9 +8,22 @@ import tempfile
 import base64
 import dotenv
 from dotenv import load_dotenv
+import torch
+from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
+from torch.nn.functional import softmax
+from doctr.models import ocr_predictor
+from doctr.io import DocumentFile
 
 load_dotenv()
 
+model = DistilBertForSequenceClassification.from_pretrained('./fine_tuned_distilbert')
+tokenizer = DistilBertTokenizer.from_pretrained('./fine_tuned_distilbert')
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+mapping = {"Remembering": 0, "Understanding": 1, "Applying": 2, "Analyzing": 3, "Evaluating": 4, "Creating": 5}
+reverse_mapping = {v: k for k, v in mapping.items()}
+modelocr = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
+
 # Previous functions from Question Generator
 def get_pdf_path(pdf_source=None, uploaded_file=None):
     try:
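A note on the hunk above: Streamlit re-runs the whole script on every interaction, so loading DistilBERT and the doctr OCR predictor at module level repeats that work on each rerun. A possible refactor (not part of this commit, assuming the same `./fine_tuned_distilbert` directory) is to wrap the loading in `st.cache_resource` so each process loads the models once:

```python
import streamlit as st
import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from doctr.models import ocr_predictor

@st.cache_resource  # load once per process; reruns reuse the cached objects
def load_models():
    model = DistilBertForSequenceClassification.from_pretrained('./fine_tuned_distilbert')
    tokenizer = DistilBertTokenizer.from_pretrained('./fine_tuned_distilbert')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    ocr = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
    return model, tokenizer, device, ocr

model, tokenizer, device, modelocr = load_models()
```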
@@ -183,63 +196,23 @@ def process_pdf_and_generate_questions(pdf_source, uploaded_file, api_key, role_
         st.error(f"Error processing PDF and generating questions: {e}")
         return []
 
-dummydata = [
-    {"question": "What is the main idea of the paper?", "score": {
-        "Knowledge": 10,
-        "Comprehension": 9,
-        "Application": 8,
-        "Analysis": 7,
-        "Synthesis": 6,
-        "Evaluation": 5
-    }},
-    {"question": "What are the key findings of the paper?", "score": {
-        "Knowledge": 9,
-        "Comprehension": 8,
-        "Application": 7,
-        "Analysis": 6,
-        "Synthesis": 5,
-        "Evaluation": 4
-    }},
-    {"question": "How does the paper contribute to the field?", "score": {
-        "Knowledge": 8,
-        "Comprehension": 7,
-        "Application": 6,
-        "Analysis": 5,
-        "Synthesis": 4,
-        "Evaluation": 3
-    }},
-    {"question": "What are the limitations of the paper?", "score": {
-        "Knowledge": 7,
-        "Comprehension": 6,
-        "Application": 5,
-        "Analysis": 4,
-        "Synthesis": 3,
-        "Evaluation": 2
-    }},
-    {"question": "What are the future research directions?", "score": {
-        "Knowledge": 6,
-        "Comprehension": 5,
-        "Application": 4,
-        "Analysis": 3,
-        "Synthesis": 2,
-        "Evaluation": 1
-    }},
-    {"question": "How does the paper compare to existing work?", "score": {
-        "Knowledge": 5,
-        "Comprehension": 4,
-        "Application": 3,
-        "Analysis": 2,
-        "Synthesis": 1,
-        "Evaluation": 0
-    }
-    }
-
-]
-
 def main():
     st.set_page_config(page_title="Academic Paper Tool", page_icon="📝", layout="wide")
-
     # Tabs for different functionalities
+    st.markdown("""
+        <style>
+        .stTabs [data-baseweb="tab"] {
+            margin-bottom: 1rem;
+            flex: 1;
+            justify-content: center;
+        }
+        .stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
+            font-size: 2rem;
+            padding: 0 2rem;
+            margin: 0;
+        }
+        </style>
+    """, unsafe_allow_html=True)
    tab1, tab2 = st.tabs(["Question Generator", "Paper Scorer"])
 
    if 'totalscore' not in st.session_state:
@@ -275,6 +248,7 @@ def main():
 
         st.session_state.pdf_url = st.text_input(
             "Enter the URL of the PDF",
+            value=st.session_state.pdf_url,
             key="pdf_url_input"
         )
 
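The `value=st.session_state.pdf_url` addition keeps the field populated across reruns. It works because the widget key (`pdf_url_input`) differs from the attribute being assigned (`pdf_url`); Streamlit raises an exception if you assign to the same session-state key a live widget owns. A leaner sketch (hypothetical key name) that lets the key itself hold the state:

```python
import streamlit as st

# The key alone persists the value across reruns; no manual mirroring needed.
st.text_input("Enter the URL of the PDF", key="pdf_url")
pdf_url = st.session_state.pdf_url
```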
@@ -445,7 +419,6 @@ def main():
                 type=['pdf','jpg','png','jpeg'],
                 label_visibility="collapsed"
             )
-
             # Custom submit button with some styling
             submit_button = st.form_submit_button(
                 "Score Paper",
@@ -455,17 +428,33 @@ def main():
 
        if submit_button:
            # Calculate total score
-            total_score = sum(
-                sum(question['score'].values())
-                for question in dummydata
-            )
-            average_score = total_score / (len(dummydata) * 6 * 10) * 100
+            print(uploaded_file.name)
+            dummydata = sendtogemini(uploaded_file.name)
+            print(dummydata)
+            total_score = {'Remembering': 0, 'Understanding': 0, 'Applying': 0, 'Analyzing': 0, 'Evaluating': 0, 'Creating': 0}
+            for item in dummydata:
+                for category in total_score:
+                    total_score[category] += item['score'][category]
+
+            # average_score = total_score / (len(dummydata) * 6 * 10) * 100
 
            # Score display columns
-            col1, col2 = st.columns([2,1])
-
-            with col1:
-                st.metric(label="Total Paper Score", value=f"{average_score:.2f}/100")
+            categories = ['Remembering', 'Understanding', 'Applying', 'Analyzing', 'Evaluating', 'Creating']
+
+            # Create 6 columns in a single row
+            cols = st.columns(6)
+
+            # Iterate through categories and populate columns
+            for i, category in enumerate(categories):
+                with cols[i]:
+                    score = round(total_score[category] / len(dummydata), ndigits=3)
+                    color = 'green' if score > .7 else 'orange' if score > .4 else 'red'
+                    st.markdown(f"""
+                        <div class="score-breakdown">
+                            <div class="score-header" style="color: {color}">{category}</div>
+                            <div style="font-size: 24px; color: {color};">{score}/1</div>
+                        </div>
+                    """, unsafe_allow_html=True)
 
            with st.expander("Show Detailed Scores", expanded=True):
                for idx, item in enumerate(dummydata, 1):
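One caveat with the hunk above: `uploaded_file.name` is only the original filename. Streamlit holds the uploaded bytes in memory, so `sendtogemini` (which asks doctr to open a path on disk) will fail unless a file with that name happens to exist in the working directory. A sketch of persisting the upload first, reusing the `tempfile` import already at the top of app.py (variable names hypothetical):

```python
import os
import tempfile

# Write the in-memory upload to a real file so process_document() can open it by path.
suffix = os.path.splitext(uploaded_file.name)[1]
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
    tmp.write(uploaded_file.getbuffer())
    tmp_path = tmp.name

dummydata = sendtogemini(tmp_path)
os.remove(tmp_path)  # clean up after scoring
```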
@@ -477,13 +466,13 @@ def main():
                    score_cols = st.columns(6)
 
                    # Scoring categories
-                    categories = ['Knowledge', 'Comprehension', 'Application', 'Analysis', 'Synthesis', 'Evaluation']
+                    categories = ['Remembering', 'Understanding', 'Applying', 'Analyzing', 'Evaluating', 'Creating']
 
                    for col, category in zip(score_cols, categories):
                        with col:
                            # Determine color based on score
-                            score = item['score'][category]
-                            color = 'green' if score > 7 else 'orange' if score > 4 else 'red'
+                            score = round(item['score'][category], ndigits=3)
+                            color = 'green' if score > .7 else 'orange' if score > .4 else 'red'
 
                            st.markdown(f"""
                                <div style="text-align: center;
@@ -492,7 +481,7 @@ def main():
                                    padding: 5px;
                                    margin-bottom: 5px;">
                                    <div style="font-weight: bold; color: {color};">{category}</div>
-                                    <div style="font-size: 18px; color: {color};">{score}/10</div>
+                                    <div style="font-size: 18px; color: {color};">{score}/1</div>
                                </div>
                            """, unsafe_allow_html=True)
 
@@ -501,12 +490,83 @@ def main():
                    # Add a separator between questions
                    if idx < len(dummydata):
                        st.markdown('---')
-    # but = st.button("Show Detailed Scores")
-    # if but:
-    #     st.write("Detailed Scores")
-    #     with st.container():
-    #         for key, value in dummydata.items():
-    #             st.write(f"{key}: {value}")
+
+def predict_with_loaded_model(text):
+    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
+    input_ids = inputs['input_ids'].to(device)
+    model.eval()
+    with torch.no_grad():
+        outputs = model(input_ids)
+        logits = outputs.logits
+    probabilities = softmax(logits, dim=-1)
+    probabilities = probabilities.squeeze().cpu().numpy()
+    # Convert to float and format to 3 decimal places
+    class_probabilities = {reverse_mapping[i]: float(f"{prob:.3f}") for i, prob in enumerate(probabilities)}
+    return class_probabilities
+
+# def process_document(input_path):
+#     return {'Avg_Confidence': 0.9397169561947093, 'String': ['What are the key differences between classification and regression tasks in', 'supervised learning, and how do you determine which algorithm to use for a', 'specific problem?', 'e How does clustering differ from dimensionality reduction, and can you', 'provide real-world examples of where each is applied?', 'What are common evaluation metrics for classification models, and how do', 'precision, recall, and F1-score relate to each other?', 'How do convolutional neural networks (CNNS) and recurrent neural networks', '(RNNS) differ in their architecture and applications?', 'What steps can be taken to identify and mitigate bias in machine learning', 'models, and why is this an important consideration?']}
+
+def process_document(input_path):
+    if input_path.lower().endswith(".pdf"):
+        doc = DocumentFile.from_pdf(input_path)
+        # print(f"Number of pages: {len(doc)}")
+    elif input_path.lower().endswith((".jpg", ".jpeg", ".png", ".bmp", ".tiff")):
+        doc = DocumentFile.from_images(input_path)
+    else:
+        raise ValueError("Unsupported file type. Please provide a PDF or an image file.")
+    result = modelocr(doc)
+    def calculate_average_confidence(result):
+        total_confidence = 0
+        word_count = 0
+        for page in result.pages:
+            for block in page.blocks:
+                for line in block.lines:
+                    for word in line.words:
+                        total_confidence += word.confidence
+                        word_count += 1
+        average_confidence = total_confidence / word_count if word_count > 0 else 0
+        return average_confidence
+    average_confidence = calculate_average_confidence(result)
+    string_result = result.render()
+    return {'Avg_Confidence': average_confidence, 'String': string_result.split('\n')}
+
+def sendtogemini(inputpath):
+    qw = process_document(inputpath)
+    questionset = str(qw['String'])
+    # Prompt sent to Gemini:
+    questionset += """You are given a list of text fragments containing question fragments extracted by an OCR model. Your task is to:
+    1. Only merge the question fragments into complete and coherent questions. Do not answer them.
+    2. Separate the questions, starting each new question with @ to make them easily distinguishable for further processing."""
+    url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent?key={os.getenv('GEMINI_API_KEY')}"
+
+    payload = {
+        "contents": [
+            {
+                "parts": [
+                    {"text": questionset}
+                ]
+            }
+        ]
+    }
+    headers = {"Content-Type": "application/json"}
+
+    response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=60)
+    result = response.json()
+    res1 = result.get("candidates", [{}])[0].get("content", {}).get("parts", [{}])[0].get("text", "")
+    question = []
+    for i in res1.split('\n'):
+        i = i.strip()
+        if len(i) > 0:
+            if i[0] == '@':
+                question.append(i[1:].strip())
+    data = []
+    for i in question:
+        d = {}
+        d['question'] = i
+        d['score'] = predict_with_loaded_model(i)
+        data.append(d)
+    return data
 
 # Run Streamlit app
 if __name__ == "__main__":
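Taken together, the new helpers form the scoring pipeline: `process_document` OCRs the upload with doctr, `sendtogemini` asks Gemini 1.5 Flash to stitch the OCR fragments into whole questions (one per `@`-prefixed line), and `predict_with_loaded_model` turns each question into six Bloom's-taxonomy probabilities. One detail worth noting: `predict_with_loaded_model` forwards only `input_ids`, which is harmless for a single string but would mis-score padded batches, so passing the tokenizer's `attention_mask` along is a worthwhile follow-up. A hypothetical end-to-end run (file name and output format invented for illustration):

```python
if __name__ == "__main__":
    # OCR -> Gemini question merging -> per-question Bloom's scoring
    data = sendtogemini("sample_exam.pdf")
    for item in data:
        top = max(item['score'], key=item['score'].get)  # most probable Bloom's level
        print(f"{item['question'][:60]:<60} -> {top} ({item['score'][top]:.3f})")
```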
 