Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -22,10 +22,12 @@ import numpy as np
|
|
22 |
from google.oauth2.credentials import Credentials
|
23 |
from google_auth_oauthlib.flow import InstalledAppFlow
|
24 |
from googleapiclient.discovery import build
|
|
|
|
|
25 |
|
26 |
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
27 |
GOOGLE_DRIVE_SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
|
28 |
-
CLIENT_SECRET_FILE = '
|
29 |
|
30 |
def authenticate_google_drive():
|
31 |
flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRET_FILE, GOOGLE_DRIVE_SCOPES)
|
@@ -34,14 +36,19 @@ def authenticate_google_drive():
|
|
34 |
|
35 |
def get_file_from_google_drive(drive_service, file_id):
|
36 |
request = drive_service.files().get_media(fileId=file_id)
|
37 |
-
file_content =
|
38 |
-
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
def extract_text_from_pdf(pdf_content):
|
41 |
-
return extract_text(pdf_content)
|
42 |
|
43 |
def extract_text_from_doc(doc_content):
|
44 |
-
return docx2txt.process(doc_content)
|
45 |
|
46 |
def preprocess_text(text):
|
47 |
text = text.replace('\n', ' ').replace('\r', ' ')
|
@@ -51,10 +58,10 @@ def preprocess_text(text):
|
|
51 |
text = re.sub(r'\s+', ' ', text).strip()
|
52 |
return text
|
53 |
|
54 |
-
def process_files(file_contents: List[
|
55 |
all_text = ""
|
56 |
for file_content in file_contents:
|
57 |
-
if
|
58 |
extracted_text = extract_text_from_pdf(file_content)
|
59 |
else:
|
60 |
extracted_text = extract_text_from_doc(file_content)
|
|
|
22 |
from google.oauth2.credentials import Credentials
|
23 |
from google_auth_oauthlib.flow import InstalledAppFlow
|
24 |
from googleapiclient.discovery import build
|
25 |
+
from googleapiclient.http import MediaIoBaseDownload
|
26 |
+
import io
|
27 |
|
28 |
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
29 |
GOOGLE_DRIVE_SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
|
30 |
+
CLIENT_SECRET_FILE = 'client_secret_64686904440-0a015tg0h941o993tif67c9mq1jr4mio.apps.googleusercontent.com.json'
|
31 |
|
32 |
def authenticate_google_drive():
|
33 |
flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRET_FILE, GOOGLE_DRIVE_SCOPES)
|
|
|
36 |
|
37 |
def get_file_from_google_drive(drive_service, file_id):
    """Download one Google Drive file and return its raw bytes.

    Args:
        drive_service: Object exposing ``files().get_media(fileId=...)``.
            Presumably the Drive client created with
            ``googleapiclient.discovery.build`` -- its construction is not
            visible in this view; confirm against the caller.
        file_id: Identifier forwarded unchanged as ``fileId``.

    Returns:
        bytes: Everything the downloader wrote into the in-memory buffer.
    """
    media_request = drive_service.files().get_media(fileId=file_id)
    buffer = io.BytesIO()
    # Fetch in 1 MiB chunks into the in-memory buffer.
    downloader = MediaIoBaseDownload(buffer, media_request, chunksize=1024 * 1024)
    finished = False
    while not finished:
        # The progress object is not needed; only the completion flag is.
        _progress, finished = downloader.next_chunk()
    # getvalue() returns the whole buffer regardless of the stream position.
    return buffer.getvalue()
|
46 |
|
47 |
def extract_text_from_pdf(pdf_content):
    """Return the text extracted from an in-memory PDF.

    Args:
        pdf_content: Raw bytes of the PDF document.

    Returns:
        Whatever ``extract_text`` returns for the document.
        NOTE(review): ``extract_text`` is imported outside this view --
        presumably pdfminer's high-level API; confirm.
    """
    # Wrap the bytes in a file-like object before handing them to extract_text.
    pdf_stream = io.BytesIO(pdf_content)
    return extract_text(pdf_stream)
|
49 |
|
50 |
def extract_text_from_doc(doc_content):
    """Return the text extracted from an in-memory Word document.

    Args:
        doc_content: Raw bytes of the document.

    Returns:
        Whatever ``docx2txt.process`` returns for the document.
        NOTE(review): the library name suggests .docx input only; legacy
        .doc files may not be supported -- confirm against callers.
    """
    # Wrap the bytes in a file-like object before handing them to docx2txt.
    doc_stream = io.BytesIO(doc_content)
    return docx2txt.process(doc_stream)
|
52 |
|
53 |
def preprocess_text(text):
|
54 |
text = text.replace('\n', ' ').replace('\r', ' ')
|
|
|
58 |
text = re.sub(r'\s+', ' ', text).strip()
|
59 |
return text
|
60 |
|
61 |
+
def process_files(file_contents: List[bytes]):
|
62 |
all_text = ""
|
63 |
for file_content in file_contents:
|
64 |
+
if file_content.startswith(b'%PDF'):
|
65 |
extracted_text = extract_text_from_pdf(file_content)
|
66 |
else:
|
67 |
extracted_text = extract_text_from_doc(file_content)
|