File size: 2,498 Bytes
c0b0220
1f46a45
c529144
 
 
 
1f46a45
c529144
c0b0220
1f46a45
 
c0b0220
 
 
c529144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c0b0220
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#from flask import Flask, request, render_template
import requests
import PyPDF2
import io
import os
import googletrans
import re
import pandas as pd
import gradio as gr


# File extensions this tool knows how to turn into text.
_SUPPORTED_EXTENSIONS = ('.pdf', '.txt')


def _local_path(file):
    """Return the filesystem path behind an uploaded-file value."""
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    # Otherwise expect a file-like object that exposes its path as ``.name``.
    return file.name


def _pdf_text(payload):
    """Return the text of every page of the PDF in *payload* (bytes)."""
    reader = PyPDF2.PdfReader(io.BytesIO(payload))
    # ``extract_text`` is guarded with ``or ''`` so an empty page cannot
    # break the join.
    return ' '.join(page.extract_text() or '' for page in reader.pages)


def _translate_to_english(text):
    """Return *text* translated to English when it is detected as non-English.

    Translation is best-effort: any failure is printed and the original text
    is returned unchanged, so dataset creation never fails because of it.
    """
    if not text.strip():
        return text
    try:
        # NOTE(review): recent googletrans releases may expose an async API;
        # confirm the pinned version offers these synchronous calls.
        translator = googletrans.Translator()
        lang_code = translator.detect(text).lang
        # Compare the language *code*; the human-readable name from
        # ``LANGUAGES`` is used only for the message.
        if lang_code.lower() != 'en':
            text = translator.translate(text, dest='en').text
            lang_name = googletrans.LANGUAGES.get(lang_code.lower(), lang_code)
            print('The text has been translated from {} to English'.format(lang_name))
    except Exception as e:  # deliberate catch-all: translation is optional
        print('Error:', e)
    return text


def upload(url=None, file=None, *, output_path='dataset.csv'):
    """Build a one-sentence-per-row CSV dataset from a ``.pdf``/``.txt`` source.

    The uploaded file takes precedence over the URL when both are given.
    The extracted text is translated to English on a best-effort basis, split
    into sentences on runs of ``.``, ``!`` and ``?``, and written to
    *output_path* with a single ``sentence`` column.

    Args:
        url: Address of a ``.pdf`` or ``.txt`` document. The type is decided
            from the extension of the URL path (query string ignored).
        file: Uploaded file, either a path or an object whose ``.name`` is
            the path.
            NOTE(review): which of the two Gradio passes depends on its
            version and the component's ``type`` setting; confirm.
        output_path: Where the CSV is written.

    Returns:
        A status message: ``'Dataset created successfully!'``,
        ``'Invalid file format'``, ``'No file or URL found'``, or a
        ``'Could not download URL: ...'`` message when the request fails.

    Raises:
        UnicodeDecodeError: If an uploaded ``.txt`` file is not valid UTF-8.
    """
    if file:
        path = _local_path(file)
        extension = os.path.splitext(path)[1].lower()
        if extension not in _SUPPORTED_EXTENSIONS:
            return 'Invalid file format'
        with open(path, 'rb') as handle:
            payload = handle.read()
        text = _pdf_text(payload) if extension == '.pdf' else payload.decode('utf-8')
    elif url and url.strip():
        from urllib.parse import urlparse

        url = url.strip()
        # Look only at the URL path so "doc.pdf?download=1" is still a PDF.
        extension = os.path.splitext(urlparse(url).path)[1].lower()
        # Reject unsupported types before spending time on the download.
        if extension not in _SUPPORTED_EXTENSIONS:
            return 'Invalid file format'
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            return 'Could not download URL: {}'.format(exc)
        text = _pdf_text(response.content) if extension == '.pdf' else response.text
    else:
        return 'No file or URL found'

    text = _translate_to_english(text)

    # Convert the text to a dataset: one non-empty, stripped sentence per row.
    sentences = [part.strip() for part in re.split(r'[.!?]+', text)]
    df = pd.DataFrame({'sentence': [s for s in sentences if s]})

    # Save the dataset to a CSV file.
    df.to_csv(output_path, index=False)

    return 'Dataset created successfully!'

# Build the web UI. Gradio passes the component values to ``upload``
# positionally, so the order here is (url, file).
demo = gr.Interface(
    upload,
    inputs=[gr.Textbox(label='url'), gr.File(label='file')],
    outputs="text",
)

# Only start the server when run as a script, so importing this module
# does not launch the app as a side effect.
if __name__ == "__main__":
    demo.launch()