File size: 3,256 Bytes
0f09d43
 
f456ef3
5692cb3
 
4107940
46323da
5692cb3
 
46323da
5692cb3
 
 
 
 
8de7c36
dbcf2e8
dd7488f
 
 
 
 
 
 
304cf45
 
 
 
f456ef3
1b47089
b8acde7
0f09d43
 
acb72cc
0f09d43
 
 
1b47089
b8acde7
5692cb3
1b47089
 
 
 
 
 
4107940
dbcf2e8
5692cb3
4107940
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
753ae25
4107940
 
 
 
753ae25
4107940
 
 
 
753ae25
4107940
 
 
 
753ae25
4107940
 
 
 
 
 
 
 
dbcf2e8
46323da
710a34d
46323da
4107940
710a34d
 
 
 
 
4107940
 
 
710a34d
 
5692cb3
710a34d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os
import shutil
from inspect import getmembers, isfunction, signature
from io import StringIO

import pandas as pd
import pytesseract
import streamlit as st
from newspaper import Article
from PIL import Image
from PyPDF2 import PdfFileReader

import core.pipelines as pipelines_functions
from core.audio import audio_to_text, load_model
from core.pipelines import data_path


def get_pipelines():
    """Collect the functions found on the ``core.pipelines`` module.

    Every attribute of the module that ``inspect.isfunction`` accepts is
    included, in the name-sorted order produced by ``inspect.getmembers``.

    Returns:
        A 3-tuple ``(pipeline_names, pipeline_funcs, pipeline_func_parameters)``
        whose elements are index-aligned:

        * ``pipeline_names`` -- list of display names; each function name is
          split on ``"_"`` and every part is capitalized, then re-joined
          with spaces (``"my_pipeline"`` -> ``"My Pipeline"``).
        * ``pipeline_funcs`` -- tuple of the function objects themselves.
        * ``pipeline_func_parameters`` -- list of dicts mapping each parameter
          name to its default value (``inspect.Parameter.empty`` when the
          parameter has no default).

        When the module exposes no functions, the three collections are
        empty instead of the call failing while unpacking.
    """
    members = getmembers(pipelines_functions, isfunction)

    # Build each collection straight from the (name, function) pairs rather
    # than transposing with zip(*...), which cannot be unpacked into two
    # names when there are no members at all.
    pipeline_funcs = tuple(func for _, func in members)
    pipeline_names = [
        " ".join(part.capitalize() for part in name.split("_"))
        for name, _ in members
    ]
    pipeline_func_parameters = [
        {
            param_name: param.default
            for param_name, param in signature(func).parameters.items()
        }
        for func in pipeline_funcs
    ]
    return pipeline_names, pipeline_funcs, pipeline_func_parameters


def reset_vars_data():
    """Reset the session's document state and empty the data directory.

    Sets ``st.session_state["doc_id"]`` to ``0`` and
    ``st.session_state["search_results"]`` to ``None``, then removes
    ``data_path`` together with everything inside it and recreates it as an
    empty directory.
    """
    st.session_state["doc_id"] = 0
    st.session_state["search_results"] = None

    # Delete data files. A missing directory already satisfies "no data
    # files", so only that specific error is tolerated; any other failure
    # (e.g. permissions) still propagates to the caller.
    try:
        shutil.rmtree(data_path)
    except FileNotFoundError:
        pass
    os.makedirs(data_path, exist_ok=True)


@st.cache_data
def extract_text_from_url(url: str):
    """Download the page at ``url`` and return its parsed article text.

    The page is fetched and parsed with ``newspaper.Article``; the returned
    value is that object's ``text`` attribute. The function is wrapped in
    ``st.cache_data``.
    """
    fetched_article = Article(url)
    # The article must be downloaded before it can be parsed.
    fetched_article.download()
    fetched_article.parse()
    return fetched_article.text


def _extract_pdf_text(file):
    """Return the concatenated text of every readable page of a PDF file."""
    reader = PdfFileReader(file)
    page_texts = []
    for page_number in range(reader.numPages):
        try:
            page_texts.append(reader.getPage(page_number).extractText())
        except Exception:
            # Best effort: a page that cannot be loaded or decoded is skipped
            # so the remaining pages still contribute their text.
            continue
    return "".join(page_texts)


def _extract_csv_text(file):
    """Return all string cells of a CSV file, each one prefixed by a space."""
    frame = pd.read_csv(file)
    # Only object-dtype columns are considered as carrying text.
    text_columns = frame.select_dtypes(include=["object"]).columns

    pieces = []
    for row in frame[text_columns].values.tolist():
        for cell in row:
            if isinstance(cell, str):
                pieces.append(cell)
            elif isinstance(cell, list):
                try:
                    pieces.append(" ".join(cell))
                except TypeError:
                    # A list holding non-string items cannot be joined; skip it.
                    continue
            # Anything else (including the float NaN pandas uses for missing
            # values) is not text and is ignored.

    # Each piece is preceded by a single space, so a non-empty result starts
    # with a space and an empty input yields "".
    return "".join(" " + piece for piece in pieces)


@st.cache_data
def extract_text_from_file(file):
    """Extract plain text from an uploaded file based on its MIME type.

    Args:
        file: File-like object exposing a ``type`` attribute holding the MIME
            type (presumably a Streamlit ``UploadedFile`` -- confirm against
            the caller). Plain-text input must additionally provide
            ``getvalue()`` returning bytes.

    Returns:
        The extracted text as a string, or ``None`` (after showing a
        Streamlit warning) when the MIME type is not supported. Handling per
        type:

        * ``text/plain`` -- the content decoded as UTF-8.
        * ``application/pdf`` -- text of all pages, unreadable pages skipped.
        * ``text/csv`` -- every string cell of the object-dtype columns, each
          preceded by a space.
        * ``image/jpeg``, ``image/png`` -- OCR output of ``pytesseract``.
        * ``audio/mpeg``, ``audio/wav``, ``audio/aac``, ``audio/x-m4a`` --
          transcription from ``audio_to_text`` using the model stored in
          ``st.session_state["audio_model"]``.

    Raises:
        UnicodeDecodeError: If a ``text/plain`` file is not valid UTF-8.
        KeyError: If an audio file is passed and
            ``st.session_state["audio_model"]`` has not been set.
    """
    mime_type = file.type

    # read text file
    if mime_type == "text/plain":
        return file.getvalue().decode("utf-8")

    # read pdf file
    if mime_type == "application/pdf":
        return _extract_pdf_text(file)

    # read csv file
    if mime_type == "text/csv":
        return _extract_csv_text(file)

    # read image file (OCR)
    if mime_type in ["image/jpeg", "image/png"]:
        return pytesseract.image_to_string(Image.open(file))

    # read audio file (audio to text)
    if mime_type in ["audio/mpeg", "audio/wav", "audio/aac", "audio/x-m4a"]:
        return audio_to_text(st.session_state["audio_model"], file)

    st.warning(f"File type {file.type} not supported")
    return None


@st.cache_resource
def load_audio_model():
    """Return the model produced by ``core.audio.load_model``.

    The function is wrapped in ``st.cache_resource``.
    """
    audio_model = load_model()
    return audio_model