Spaces:

Luciferalive
/

Rag-v10

Sleeping

App Files Files Community

Luciferalive committed on May 13, 2024

Commit

6bc73f4

verified ·

1 Parent(s): e7df89b

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -7

app.py CHANGED Viewed

@@ -22,10 +22,12 @@ import numpy as np
 from google.oauth2.credentials import Credentials
 from google_auth_oauthlib.flow import InstalledAppFlow
 from googleapiclient.discovery import build
 HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
 GOOGLE_DRIVE_SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
-CLIENT_SECRET_FILE = 'path/to/client_secret.json'
 def authenticate_google_drive():
     flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRET_FILE, GOOGLE_DRIVE_SCOPES)
@@ -34,14 +36,19 @@ def authenticate_google_drive():
 def get_file_from_google_drive(drive_service, file_id):
     request = drive_service.files().get_media(fileId=file_id)
-    file_content = request.execute()
-    return file_content
 def extract_text_from_pdf(pdf_content):
-    return extract_text(pdf_content)
 def extract_text_from_doc(doc_content):
-    return docx2txt.process(doc_content)
 def preprocess_text(text):
     text = text.replace('\n', ' ').replace('\r', ' ')
@@ -51,10 +58,10 @@ def preprocess_text(text):
     text = re.sub(r'\s+', ' ', text).strip()
     return text
-def process_files(file_contents: List[str]):
     all_text = ""
     for file_content in file_contents:
-        if isinstance(file_content, bytes):
             extracted_text = extract_text_from_pdf(file_content)
         else:
             extracted_text = extract_text_from_doc(file_content)

 from google.oauth2.credentials import Credentials
 from google_auth_oauthlib.flow import InstalledAppFlow
 from googleapiclient.discovery import build
+from googleapiclient.http import MediaIoBaseDownload
+import io
 HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
 GOOGLE_DRIVE_SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
+CLIENT_SECRET_FILE = 'client_secret_64686904440-0a015tg0h941o993tif67c9mq1jr4mio.apps.googleusercontent.com.json'
 def authenticate_google_drive():
     flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRET_FILE, GOOGLE_DRIVE_SCOPES)
 def get_file_from_google_drive(drive_service, file_id):
     request = drive_service.files().get_media(fileId=file_id)
+    file_content = io.BytesIO()
+    downloader = MediaIoBaseDownload(file_content, request, chunksize=1024*1024)
+    done = False
+    while done is False:
+        status, done = downloader.next_chunk()
+    file_content.seek(0)
+    return file_content.read()
 def extract_text_from_pdf(pdf_content):
+    return extract_text(io.BytesIO(pdf_content))
 def extract_text_from_doc(doc_content):
+    return docx2txt.process(io.BytesIO(doc_content))
 def preprocess_text(text):
     text = text.replace('\n', ' ').replace('\r', ' ')
     text = re.sub(r'\s+', ' ', text).strip()
     return text
+def process_files(file_contents: List[bytes]):
     all_text = ""
     for file_content in file_contents:
+        if file_content.startswith(b'%PDF'):
             extracted_text = extract_text_from_pdf(file_content)
         else:
             extracted_text = extract_text_from_doc(file_content)