amendolajine committed
Commit b178a19 · 1 Parent(s): 8766103

Update app.py

Files changed (1)
  1. app.py +17 -25
app.py CHANGED
@@ -28,34 +28,29 @@ def extract_abstract(pdf_bytes):
     return "Error in abstract extraction"
 
 def process_text(uploaded_file):
-    # Debugging: Print the type and path of the uploaded_file
-    print(f"Uploaded file type: {type(uploaded_file)}")
-    print(f"Uploaded file path: {uploaded_file}")
+    logging.debug(f"Uploaded file type: {type(uploaded_file)}")
+    logging.debug(f"Uploaded file content: {uploaded_file}")
 
-    # Read PDF file from the path
     try:
         with open(uploaded_file, "rb") as file:
             pdf_bytes = file.read()
     except Exception as e:
-        print(f"Error reading file from path: {e}")
+        logging.error(f"Error reading file from path: {e}")
         return "Error reading PDF file", None
 
     try:
         abstract_text = extract_abstract(pdf_bytes)
-        logging.info(f"Extracted abstract: {abstract_text[:200]}...")  # Log first 200 chars of abstract
+        logging.info(f"Extracted abstract: {abstract_text[:200]}...")
     except Exception as e:
         logging.error(f"Error in abstract extraction: {e}")
         return "Error in processing PDF", None
 
     try:
-        # Prepare inputs for the model
         inputs = tokenizer([abstract_text], max_length=1024, return_tensors='pt', truncation=True, padding="max_length")
-
-        # Generate summary
         summary_ids = model.generate(
             input_ids=inputs['input_ids'],
-            attention_mask=inputs['attention_mask'],  # Include attention mask
-            pad_token_id=model.config.pad_token_id,  # Include pad token id
+            attention_mask=inputs['attention_mask'],
+            pad_token_id=model.config.pad_token_id,
             num_beams=4,
             max_length=40,
             min_length=10,
@@ -65,30 +60,27 @@ def process_text(uploaded_file):
         )
         summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
 
-        # Post-process the summary
-        #words = summary.split()
-        #cleaned_summary = []
-        #for i, word in enumerate(words):
-        #    if '-' in word and i < len(words) - 1:
-        #        word = word.replace('-', '') + words[i + 1]
-        #        words[i + 1] = ""
+        words = summary.split()
+        cleaned_summary = []
+        for i, word in enumerate(words):
+            if '-' in word and i < len(words) - 1:
+                word = word.replace('-', '') + words[i + 1]
+                words[i + 1] = ""
 
-        #    if '.' in word and i != len(words) - 1:
-        #        word = word.replace('.', '')
-        #        cleaned_summary.append(word + ' and')
-        #    else:
-        #        cleaned_summary.append(word)
+            if '.' in word and i != len(words) - 1:
+                word = word.replace('.', '')
+                cleaned_summary.append(word + ' and')
+            else:
+                cleaned_summary.append(word)
 
         final_summary = ' '.join(cleaned_summary)
         final_summary = final_summary[0].upper() + final_summary[1:]
         final_summary = ' '.join(w[0].lower() + w[1:] if w.lower() != 'and' else w for w in final_summary.split())
 
-        # Convert summary to speech
         speech = synthesiser(final_summary, forward_params={"do_sample": True})
         audio_data = speech["audio"].squeeze()
         normalized_audio_data = np.int16(audio_data / np.max(np.abs(audio_data)) * 32767)
 
-        # Save audio to temporary file
         output_file = "temp_output.wav"
         scipy.io.wavfile.write(output_file, rate=speech["sampling_rate"], data=normalized_audio_data)
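
A note for anyone running this revision: Python's root logger emits only WARNING and above by default, so the logging.debug and logging.info calls introduced here stay silent unless logging is configured somewhere in app.py. A minimal sketch of such a setup, assuming no handler is configured elsewhere in the file:

import logging

# The root logger defaults to WARNING; lower it so the debug/info
# calls added in this commit actually appear in the logs.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s %(levelname)s %(message)s",
)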
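
The second hunk re-enables the previously commented-out post-processing of the summary. Below is a self-contained sketch of what that restored loop does, with the lines from the commit wrapped in a helper; the name clean_summary and the sample sentence are illustrative, not part of the commit:

def clean_summary(summary: str) -> str:
    words = summary.split()
    cleaned_summary = []
    for i, word in enumerate(words):
        # Re-join a token the summarizer split with a hyphen by gluing
        # on the next word and blanking that word out.
        if '-' in word and i < len(words) - 1:
            word = word.replace('-', '') + words[i + 1]
            words[i + 1] = ""

        # Turn mid-summary sentence breaks into "and" so the TTS model
        # reads one continuous sentence; the final period is kept.
        if '.' in word and i != len(words) - 1:
            word = word.replace('.', '')
            cleaned_summary.append(word + ' and')
        else:
            cleaned_summary.append(word)

    final_summary = ' '.join(cleaned_summary)
    final_summary = final_summary[0].upper() + final_summary[1:]
    # Lowercase the first letter of every word except "and"; note this
    # also undoes the capitalization applied on the line above.
    final_summary = ' '.join(
        w[0].lower() + w[1:] if w.lower() != 'and' else w
        for w in final_summary.split()
    )
    return final_summary

print(clean_summary("The model uses trans- formers. Results are strong."))
# -> the model uses transformers and results are strong.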