Spaces:

imseldrith
/

BookTODataset

Build error

App Files Files Community

imseldrith committed on Feb 10, 2023

Commit

5700651

1 Parent(s): 69f0100

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -66

app.py CHANGED Viewed

@@ -1,78 +1,77 @@
-#from flask import Flask, request, render_template
-import requests
 import PyPDF2
-import io
-import os
-import googletrans
 import re
 import pandas as pd
-import gradio as gr
-def upload(url , file):
-    file = gr.Inputs.File('file')
-    url = gr.Textbox('url')
-    if file:
-        filename = file.filename
-        file_extension = os.path.splitext(filename)[1]
-        file.save(filename)
-        # Check file extension and read the content
-        if file_extension == '.pdf':
-            # Read pdf file
-            pdf_file = PyPDF2.PdfFileReader(file)
-            text = ''
-            for page in range(pdf_file.getNumPages()):
-                text += pdf_file.getPage(page).extractText() + ' '
-        elif file_extension == '.txt':
-            # Read txt file
-            text = file.read().decode('utf-8')
-        else:
-            return 'Invalid file format'
-    elif url:
-        response = requests.get(url)
-        file_extension = os.path.splitext(url)[1]
-        # Check file extension and read the content
-        if file_extension == '.pdf':
-            # Read pdf file
-            pdf_file = PyPDF2.PdfFileReader(io.BytesIO(response.content))
-            text = ''
-            for page in range(pdf_file.getNumPages()):
-                text += pdf_file.getPage(page).extractText() + ' '
-        elif file_extension == '.txt':
-            # Read txt file
-            text = response.text
-        else:
-            return 'Invalid file format'
     else:
-        return 'No file or URL found'
-    # Check if the language of the text is English, otherwise translate it
-    try:
-        src_lang = googletrans.LANGUAGES[googletrans.detect(text).lang]
-        if src_lang != 'en':
-            # Initialize the translator
-            translator = googletrans.Translator()
-            # Translate the text to English
-            text = translator.translate(text, dest='en').text
-            # Display a message indicating the text has been translated
-            print('The text has been translated from {} to English'.format(src_lang))
-    except Exception as e:
-        print('Error:', e)
-    # Convert the text to a dataset
-    lines = re.split(r'[.!?]+', text)
-    lines = [line.strip() for line in lines if line.strip() != '']
-    data = {'sentence': lines}
-    df = pd.DataFrame(data)
-    # Save the dataset to a CSV file
-    df.to_csv('dataset.csv', index=False)
-    return 'Dataset created successfully!'
-gr.Interface(upload,inputs=[url or file], outputs="text").launch()

+import gradio as gr
+import urllib.request
 import PyPDF2
 import re
 import pandas as pd
+from tqdm import tqdm
+def extract_text_from_pdf(pdf_file):
+    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
+    text = ""
+    for page in range(pdf_reader.numPages):
+        text += pdf_reader.getPage(page).extractText()
+    return text
+def extract_text_from_txt(txt_file):
+    with open(txt_file, "r") as file:
+        text = file.read()
+    return text
+def book_to_dataset(file, file_type):
+    if file_type == "pdf":
+        text = extract_text_from_pdf(file)
+    elif file_type == "txt":
+        text = extract_text_from_txt(file)
     else:
+        raise ValueError("Invalid file type")
+    words = re.findall(r'\w+', text)
+    words_frequency = {}
+    for word in words:
+        words_frequency[word] = words_frequency.get(word, 0) + 1
+    df = pd.DataFrame(list(words_frequency.items()), columns=["Word", "Frequency"])
+    return df
+def book_to_dataset_progress(file, file_type):
+    if file_type == "pdf":
+        text = extract_text_from_pdf(file)
+    elif file_type == "txt":
+        text = extract_text_from_txt(file)
+    else:
+        raise ValueError("Invalid file type")
+    words = re.findall(r'\w+', text)
+    words_frequency = {}
+    for word in tqdm(words, desc="Converting..."):
+        words_frequency[word] = words_frequency.get(word, 0) + 1
+    df = pd.DataFrame(list(words_frequency.items()), columns=["Word", "Frequency"])
+    return df
+def book_converter(inputs):
+    if inputs[1] == "URL":
+        url = inputs[0]
+        file_name = url.split("/")[-1]
+        urllib.request.urlretrieve(url, file_name)
+        file = file_name
+        file_type = file_name.split(".")[-1]
+    else:
+        file = inputs[0]
+        file_type = inputs[2].split(".")[-1]
+    return book_to_dataset_progress(file, file_type)
+inputs = gr.inputs.Column(
+    [
+        gr.inputs.Textbox(lines=1, default="Enter URL or choose file", element_type="url"),
+        gr.inputs.Radio(["URL", "File"], default="URL"),
+        gr.inputs.FileUploader(upload_label="Choose file", clear_label="Clear file",)
+    ],
+    label="Input"
+)
+interface = gr.Interface(
+    book_converter,
+    inputs,
+    gr.outputs.Dataframe(),
+    title="Book to Dataset Converter",
+    description="Convert a book in pdf or txt format to a dataset that can be used to train AI models."
+)
+interface.launch()