Spaces:
Running
Running
File size: 3,256 Bytes
0f09d43 f456ef3 5692cb3 4107940 46323da 5692cb3 46323da 5692cb3 8de7c36 dbcf2e8 dd7488f 304cf45 f456ef3 1b47089 b8acde7 0f09d43 acb72cc 0f09d43 1b47089 b8acde7 5692cb3 1b47089 4107940 dbcf2e8 5692cb3 4107940 753ae25 4107940 753ae25 4107940 753ae25 4107940 753ae25 4107940 dbcf2e8 46323da 710a34d 46323da 4107940 710a34d 4107940 710a34d 5692cb3 710a34d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
import os
import shutil
from inspect import getmembers, isfunction, signature
from io import StringIO
import pandas as pd
import pytesseract
import streamlit as st
from newspaper import Article
from PIL import Image
from PyPDF2 import PdfFileReader
import core.pipelines as pipelines_functions
from core.audio import audio_to_text, load_model
from core.pipelines import data_path
def get_pipelines():
    """Discover the functions exposed by the pipelines module.

    Every function found on ``pipelines_functions`` (via
    ``inspect.getmembers`` with ``isfunction``) is reported.

    Returns:
        A 3-tuple ``(pipeline_names, pipeline_funcs, pipeline_func_parameters)``
        whose entries are index-aligned:

        * ``pipeline_names`` -- list of display names built from the function
          names: underscores become spaces and each word is capitalized
          (``"keyword_search"`` -> ``"Keyword Search"``).
        * ``pipeline_funcs`` -- tuple of the function objects.
        * ``pipeline_func_parameters`` -- list of dicts mapping each parameter
          name to its default value (``inspect.Parameter.empty`` when the
          parameter has no default).

        When the module exposes no functions, ``([], (), [])`` is returned.
    """
    members = getmembers(pipelines_functions, isfunction)
    if not members:
        # zip(*[]) produces nothing, so the two-name unpacking below would
        # raise ValueError; report "no pipelines" instead.
        return [], (), []

    raw_names, pipeline_funcs = zip(*members)
    pipeline_names = [
        " ".join(word.capitalize() for word in raw_name.split("_"))
        for raw_name in raw_names
    ]
    pipeline_func_parameters = [
        {
            param_name: param.default
            for param_name, param in signature(pipe_func).parameters.items()
        }
        for pipe_func in pipeline_funcs
    ]
    return pipeline_names, pipeline_funcs, pipeline_func_parameters
def reset_vars_data():
    """Reset the session's search state and empty the data directory.

    Sets ``st.session_state["doc_id"]`` to ``0`` and
    ``st.session_state["search_results"]`` to ``None``, then removes
    ``data_path`` with everything in it and recreates it as an empty
    directory.
    """
    st.session_state["doc_id"] = 0
    st.session_state["search_results"] = None
    # Delete data files. A missing directory is not an error here: the goal
    # is an empty directory, and makedirs below recreates it either way.
    try:
        shutil.rmtree(data_path)
    except FileNotFoundError:
        pass
    os.makedirs(data_path, exist_ok=True)
@st.cache_data
def extract_text_from_url(url: str):
    """Download the article at ``url`` and return its parsed text.

    The page is fetched and parsed with ``newspaper.Article``; the value of
    the article's ``text`` attribute is returned.
    """
    fetched = Article(url)
    fetched.download()
    fetched.parse()
    body_text = fetched.text
    return body_text
def _pdf_to_text(file):
    """Return the concatenated text of every readable page of a PDF."""
    # NOTE(review): PdfFileReader / numPages / getPage / extractText are the
    # legacy PyPDF2 names; newer PyPDF2 releases rename them -- confirm the
    # pinned PyPDF2 version before upgrading.
    reader = PdfFileReader(file)
    page_texts = []
    for page_index in range(reader.numPages):
        try:
            page_texts.append(reader.getPage(page_index).extractText())
        except Exception:
            # Best effort: a page that cannot be read is skipped, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            continue
    return "".join(page_texts)


def _csv_to_text(file):
    """Return the string cells of a CSV's object-dtype columns as one string.

    Cells are visited row by row; each kept cell is prefixed with a single
    space, so a non-empty result starts with a space.
    """
    frame = pd.read_csv(file)
    # get columns of type string
    string_columns = frame.select_dtypes(include=["object"]).columns
    pieces = []
    for row in frame[string_columns].values.tolist():
        for cell in row:
            # remove NaNs
            if str(cell) == "nan":
                continue
            if isinstance(cell, str):
                pieces.append(cell)
            elif isinstance(cell, list):
                try:
                    pieces.append(" ".join(cell))
                except TypeError:
                    # list holds non-string items; skip the cell
                    continue
            # any other cell type is ignored
    return "".join(" " + piece for piece in pieces)


@st.cache_data
def extract_text_from_file(file):
    """Extract text from an uploaded file, dispatching on its MIME type.

    Args:
        file: object exposing a ``type`` attribute (MIME string) and, for
            plain text, ``getvalue()`` returning bytes. Presumably a
            Streamlit ``UploadedFile`` -- confirm against the caller.

    Returns:
        The extracted text, or ``None`` (after showing a Streamlit warning)
        when the MIME type is not handled:

        * ``text/plain`` -- the content decoded as UTF-8.
        * ``application/pdf`` -- text of all readable pages, concatenated.
        * ``text/csv`` -- string cells of the object-dtype columns.
        * ``image/jpeg``, ``image/png`` -- OCR output from pytesseract.
        * ``audio/mpeg``, ``audio/wav``, ``audio/aac``, ``audio/x-m4a`` --
          result of ``audio_to_text`` using
          ``st.session_state["audio_model"]``.

    Raises:
        UnicodeDecodeError: if a ``text/plain`` file is not valid UTF-8.
    """
    mime_type = file.type

    # read text file
    if mime_type == "text/plain":
        return file.getvalue().decode("utf-8")

    # read pdf file
    if mime_type == "application/pdf":
        return _pdf_to_text(file)

    # read csv file
    if mime_type == "text/csv":
        return _csv_to_text(file)

    # read image file (OCR)
    if mime_type in ["image/jpeg", "image/png"]:
        return pytesseract.image_to_string(Image.open(file))

    # read audio file (audio-to-text)
    if mime_type in ["audio/mpeg", "audio/wav", "audio/aac", "audio/x-m4a"]:
        return audio_to_text(st.session_state["audio_model"], file)

    st.warning(f"File type {file.type} not supported")
    return None
@st.cache_resource
def load_audio_model():
    """Return the audio model produced by ``core.audio.load_model``.

    Wrapped in ``st.cache_resource`` so the call goes through Streamlit's
    resource cache.
    """
    audio_model = load_model()
    return audio_model
|