# openai_server/agent_tools/convert_document_to_text.py
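"""Agent tool: convert local documents and URLs to plain text via h2oGPT's document loaders."""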
import argparse
import sys
import uuid
if 'src' not in sys.path:
    sys.path.append('src')

from src.function_client import get_data_h2ogpt

def has_gpu():
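    """Return True if an NVIDIA GPU is visible to nvidia-smi, else False."""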
    import subprocess
    try:
        result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        return result.returncode == 0
    except FileNotFoundError:
        return False

def pdf_has_images(pdf_path):
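    """Return True if any page of the PDF contains at least one embedded image."""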
    import fitz
    doc = fitz.open(pdf_path)
    for page_num in range(len(doc)):
        page = doc[page_num]
        image_list = page.get_images()
        if image_list:
            # print(f"Page {page_num + 1} contains {len(image_list)} image(s)")
            return True
    # print("No images found in the PDF")
    return False

def get_num_pages(file):
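    """Return the page count of the document, or None if it cannot be opened."""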
    try:
        import fitz
        src = fitz.open(file)
        return len(src)
    except Exception:
        return None

def convert_to_csv(file):
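    """Convert an .xls/.xlsx file to a sibling .csv file for data analysis."""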
    import pandas as pd
    # read the xls or xlsx file
    if file.lower().endswith(('.xls', '.xlsx')):
        df = pd.read_excel(file)
        # replace the longer extension first so 'file.xlsx' does not become 'file.csvx'
        new_file = file.replace('.xlsx', '.csv').replace('.xls', '.csv')
        try:
            df.to_csv(new_file, index=False)
            print(f"Converted {file} to CSV for data analysis as {new_file}")
        except Exception:
            pass

def sources_to_text(sources1):
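    """Wrap each extracted source in a <document> block with its metadata;
    return (all blocks joined as one string, list of individual blocks)."""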
    each_content1 = []
    all_content1 = ''
    for source in sources1:
        meta_str = ''
        meta = source.metadata
        if 'source' in meta:
            meta_str += f"Source: {meta['source']}\n"
        if 'parser' in meta:
            meta_str += f"Parser: {meta['parser']}\n"
        if 'title' in meta:
            meta_str += f"Title: {meta['title']}\n"
        if 'page' in meta:
            meta_str += f"Page: {meta['page']}\n"
        content1 = f"""\n<document>\n{meta_str}\n<text>\n{source.page_content}\n</text>\n</document>\n"""
        each_content1.append(content1)
        all_content1 += content1
    return all_content1, each_content1

def process_files(files, urls):
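    """Extract text from files and URLs via get_data_h2ogpt, choosing PDF parsers
    heuristically; return (list of per-source text blocks, whether any input succeeded)."""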
    text_context_list = []
    succeeded = []
    textual_types = ('.txt', '.csv', '.toml', '.py', '.rst', '.rtf', '.md', '.html', '.htm', '.xml', '.json', '.yaml',
                     '.yml', '.ini', '.log', '.tex', '.sql', '.sh', '.bat', '.js', '.css', '.php', '.jsp', '.pl', '.r',
                     '.lua', '.conf', '.properties', '.tsv', '.xhtml', '.srt', '.vtt', '.cpp', '.c', '.h', '.go')
    doc_types = ('.pdf', '.docx', '.doc', '.epub', '.pptx', '.ppt', '.xls', '.xlsx')
    from openai_server.agent_tools.common.utils import filename_is_url
    # re-bucket inputs: anything passed as a file that is actually a URL is treated as a URL
    files_new = []
    urls_new = []
    for filename in files + urls:
        if filename in files:
            if filename_is_url(filename):
                urls_new.append(filename)
            else:
                files_new.append(filename)
        else:
            urls_new.append(filename)
    files = files_new
    urls = urls_new
    from openai_server.agent_tools.common.utils import download_simple
    for filename in files + urls:
        enable_transcriptions = False
        enable_llava = False
        if filename.lower().endswith('.pdf'):
            if filename in urls:
                newfile = download_simple(filename)
                num_pages = get_num_pages(newfile)
                has_images = pdf_has_images(newfile)
            else:
                num_pages = get_num_pages(filename)
                has_images = pdf_has_images(filename)
            if num_pages and num_pages < 20:
                # short PDF: use docTR OCR if it contains images (likely scanned), else plain pyPDF
                if has_images:
                    enable_pdf_doctr = 'on'
                    use_pypdf = 'off'
                else:
                    enable_pdf_doctr = 'off'
                    use_pypdf = 'on'
                use_pymupdf = 'off'
            else:
                # long (or unreadable) PDF: PyMuPDF only, no OCR
                enable_pdf_doctr = 'off'
                use_pymupdf = 'on'
                use_pypdf = 'off'
        else:
            # non-pdf, allow docTR in case, e.g. video
            enable_pdf_doctr = 'on'
            use_pymupdf = 'on'
            use_pypdf = 'off'
            enable_transcriptions = True
            enable_llava = True
        if filename.lower().endswith(('.xls', '.xlsx')):
            if filename in urls:
                xls_file = download_simple(filename)
            else:
                xls_file = filename
            convert_to_csv(xls_file)
        sources1, known_type = get_data_h2ogpt(filename,
                                               is_url=filename in urls,
                                               verbose=False,
                                               use_pymupdf=use_pymupdf,
                                               use_pypdf=use_pypdf,
                                               use_unstructured_pdf='off',
                                               enable_pdf_ocr='off',
                                               enable_pdf_doctr=enable_pdf_doctr,
                                               try_pdf_as_html='off',
                                               enable_captions=False,  # no need if llava used
                                               enable_llava=enable_llava,
                                               chunk=False,
                                               enable_transcriptions=enable_transcriptions,
                                               )
        all_content1, each_content1 = sources_to_text(sources1)
        if filename.lower().endswith('.pdf') and enable_pdf_doctr == 'off':
            # retry with the other text-based PDF parser, since the first may find little text
            if use_pymupdf == 'on':
                use_pymupdf = 'off'
                use_pypdf = 'on'
            else:
                use_pymupdf = 'on'
                use_pypdf = 'off'
            sources2, known_type = get_data_h2ogpt(filename,
                                                   is_url=filename in urls,
                                                   verbose=False,
                                                   use_pymupdf=use_pymupdf,
                                                   use_pypdf=use_pypdf,
                                                   use_unstructured_pdf='off',
                                                   enable_pdf_ocr='off',
                                                   enable_pdf_doctr=enable_pdf_doctr,
                                                   try_pdf_as_html='off',
                                                   enable_captions=False,
                                                   enable_llava=False,
                                                   chunk=False,
                                                   enable_transcriptions=False,
                                                   )
            all_content2, each_content2 = sources_to_text(sources2)
            # choose one with more content in case pymupdf fails to find info
            if len(all_content2) > len(all_content1):
                # keep the success bookkeeping below consistent with the chosen parse
                sources1 = sources2
                each_content1 = each_content2
        if not sources1:
            succeeded.append(False)
            print(f"Unable to handle file type for {filename}")
        else:
            succeeded.append(True)
            text_context_list.extend(each_content1)
    return text_context_list, any(succeeded)

def get_text(files, urls):
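    """Convert all files/urls to text and join the pieces, or return None if nothing succeeded."""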
    text_context_list, any_succeeded = process_files(files, urls)
    # Join the text_context_list into a single string
    if any_succeeded:
        output_text = "\n\n".join(text_context_list)
    else:
        output_text = None
    return output_text

def main():
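    """Parse CLI arguments, convert the inputs to text, and write the result to a file."""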
    parser = argparse.ArgumentParser(description="Converts documents to text")
    parser.add_argument("--files", nargs="+", required=False, help="Files to convert to text")
    parser.add_argument("--urls", nargs="+", required=False, help="URLs to convert to text")
    parser.add_argument("--output", type=str, required=False, help="Output filename")
    args = parser.parse_args()

    if not args.output:
        args.output = f"conversion_to_text_{str(uuid.uuid4())[:6]}.txt"

    files = args.files or []
    urls = args.urls or []
    output_text = get_text(files, urls)

    # Write the output to the specified file
    if output_text is not None:
        with open(args.output, "w") as f:
            f.write(output_text)
        print(f"{files + urls} have been converted to text and written to {args.output}")
        print(
            "The output may be complex for input of PDFs or URLs etc., so do not assume the structure of the output file and instead check it directly.")
        print("It is probably a good idea to verify any use of convert_document_to_text.py with ask_question_about_documents.py.")
        max_tokens = 1024
        max_chars = max_tokens * 4
        if len(output_text) > max_chars:
            print(f"Head of the text (MUST use file {args.output} for full text):")
            print(output_text[:max_chars])
        else:
            print(output_text)
    else:
        print("Failed to convert files or URLs to text")
    return output_text

if __name__ == "__main__":
    main()

"""
Examples:
wget https://aiindex.stanford.edu/wp-content/uploads/2024/04/HAI_2024_AI-Index-Report.pdf
python /home/jon/h2ogpt/openai_server/agent_tools/convert_document_to_text.py --urls http://www.cnn.com
python /home/jon/h2ogpt/openai_server/agent_tools/convert_document_to_text.py --files HAI_2024_AI-Index-Report.pdf
python /home/jon/h2ogpt/openai_server/agent_tools/convert_document_to_text.py --urls https://aiindex.stanford.edu/wp-content/uploads/2024/04/HAI_2024_AI-Index-Report.pdf
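# --output sets the output filename explicitly (otherwise a random conversion_to_text_*.txt name is used); report.txt is just an illustrative name
python /home/jon/h2ogpt/openai_server/agent_tools/convert_document_to_text.py --files HAI_2024_AI-Index-Report.pdf --output report.txt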
"""