Luciferalive committed on
Commit
6bc73f4
·
verified ·
1 Parent(s): e7df89b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -7
app.py CHANGED
@@ -22,10 +22,12 @@ import numpy as np
22
  from google.oauth2.credentials import Credentials
23
  from google_auth_oauthlib.flow import InstalledAppFlow
24
  from googleapiclient.discovery import build
 
 
25
 
26
  HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
27
  GOOGLE_DRIVE_SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
28
- CLIENT_SECRET_FILE = 'path/to/client_secret.json'
29
 
30
  def authenticate_google_drive():
31
  flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRET_FILE, GOOGLE_DRIVE_SCOPES)
@@ -34,14 +36,19 @@ def authenticate_google_drive():
34
 
35
  def get_file_from_google_drive(drive_service, file_id):
36
  request = drive_service.files().get_media(fileId=file_id)
37
- file_content = request.execute()
38
- return file_content
 
 
 
 
 
39
 
40
  def extract_text_from_pdf(pdf_content):
41
- return extract_text(pdf_content)
42
 
43
  def extract_text_from_doc(doc_content):
44
- return docx2txt.process(doc_content)
45
 
46
  def preprocess_text(text):
47
  text = text.replace('\n', ' ').replace('\r', ' ')
@@ -51,10 +58,10 @@ def preprocess_text(text):
51
  text = re.sub(r'\s+', ' ', text).strip()
52
  return text
53
 
54
- def process_files(file_contents: List[str]):
55
  all_text = ""
56
  for file_content in file_contents:
57
- if isinstance(file_content, bytes):
58
  extracted_text = extract_text_from_pdf(file_content)
59
  else:
60
  extracted_text = extract_text_from_doc(file_content)
 
22
  from google.oauth2.credentials import Credentials
23
  from google_auth_oauthlib.flow import InstalledAppFlow
24
  from googleapiclient.discovery import build
25
+ from googleapiclient.http import MediaIoBaseDownload
26
+ import io
27
 
28
  HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
29
  GOOGLE_DRIVE_SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
30
+ CLIENT_SECRET_FILE = 'client_secret_64686904440-0a015tg0h941o993tif67c9mq1jr4mio.apps.googleusercontent.com.json'
31
 
32
  def authenticate_google_drive():
33
  flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRET_FILE, GOOGLE_DRIVE_SCOPES)
 
36
 
37
  def get_file_from_google_drive(drive_service, file_id):
38
  request = drive_service.files().get_media(fileId=file_id)
39
+ file_content = io.BytesIO()
40
+ downloader = MediaIoBaseDownload(file_content, request, chunksize=1024*1024)
41
+ done = False
42
+ while done is False:
43
+ status, done = downloader.next_chunk()
44
+ file_content.seek(0)
45
+ return file_content.read()
46
 
47
  def extract_text_from_pdf(pdf_content):
48
+ return extract_text(io.BytesIO(pdf_content))
49
 
50
  def extract_text_from_doc(doc_content):
51
+ return docx2txt.process(io.BytesIO(doc_content))
52
 
53
  def preprocess_text(text):
54
  text = text.replace('\n', ' ').replace('\r', ' ')
 
58
  text = re.sub(r'\s+', ' ', text).strip()
59
  return text
60
 
61
+ def process_files(file_contents: List[bytes]):
62
  all_text = ""
63
  for file_content in file_contents:
64
+ if file_content.startswith(b'%PDF'):
65
  extracted_text = extract_text_from_pdf(file_content)
66
  else:
67
  extracted_text = extract_text_from_doc(file_content)