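"""
Convert local documents and URLs to plain text.

Uses h2oGPT's document ingestion (get_data_h2ogpt) to parse PDFs, office
documents, web pages, and audio/video (via transcription), then writes the
combined text to an output file. Intended to be run from the command line as
an agent tool; see the examples at the bottom of this file.
"""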
import argparse
import sys
import uuid

if 'src' not in sys.path:
    sys.path.append('src')

from src.function_client import get_data_h2ogpt


def has_gpu():
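    """Return True if `nvidia-smi` runs successfully, i.e. an NVIDIA GPU is visible."""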
    import subprocess
    try:
        result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        return result.returncode == 0
    except FileNotFoundError:
        return False


def pdf_has_images(pdf_path):
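    """Return True if any page of the PDF at pdf_path contains an embedded image (via PyMuPDF)."""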
    import fitz
    doc = fitz.open(pdf_path)
    for page_num in range(len(doc)):
        page = doc[page_num]
        image_list = page.get_images()
        if image_list:
            # print(f"Page {page_num + 1} contains {len(image_list)} image(s)")
            return True
    # print("No images found in the PDF")
    return False


def get_num_pages(file):
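    """Return the number of pages in the PDF, or None if it cannot be opened."""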
    try:
        import fitz
        src = fitz.open(file)
        return len(src)
    except Exception:
        return None


def convert_to_csv(file):
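    """Convert an .xls/.xlsx file to a CSV alongside it, for easier data analysis."""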
    import pandas as pd
    # read the xls or xlsx file
    if file.lower().endswith('.xls') or file.lower().endswith('.xlsx'):
        df = pd.read_excel(file)
        # replace the longer '.xlsx' suffix first so 'data.xlsx' becomes 'data.csv', not 'data.csvx'
        new_file = file.replace('.xlsx', '.csv').replace('.xls', '.csv')
        try:
            df.to_csv(new_file, index=False)
            print(f"Converted {file} to CSV for data analysis as {new_file}")
        except Exception:
            pass


def sources_to_text(sources1):
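    """Render parsed document chunks into <document>-tagged text.

    Returns (all chunks concatenated, list of per-chunk strings), each chunk
    prefixed with its source/parser/title/page metadata when present.
    """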
    each_content1 = []
    all_content1 = ''
    for source in sources1:
        meta_str = ''
        meta = source.metadata
        if 'source' in meta:
            meta_str += f"Source: {meta['source']}\n"
        if 'parser' in meta:
            meta_str += f"Parser: {meta['parser']}\n"
        if 'title' in meta:
            meta_str += f"Title: {meta['title']}\n"
        if 'page' in meta:
            meta_str += f"Page: {meta['page']}\n"
        content1 = f"""\n<document>\n{meta_str}\n<text>\n{source.page_content}\n</text>\n</document>\n"""
        each_content1.append(content1)
        all_content1 += content1
    return all_content1, each_content1


def process_files(files, urls):
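    """Extract text from each file or URL.

    PDF parsing heuristic: small PDFs (< 20 pages) use docTR OCR if they
    contain images, otherwise pypdf; larger PDFs use PyMuPDF. Text-based PDF
    parses are retried with the alternate parser and the longer result is kept.

    Returns (list of per-chunk text strings, whether any input succeeded).
    """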
    text_context_list = []
    succeeded = []
    # known extensions (kept for reference; not otherwise used below)
    textual_types = ('.txt', '.csv', '.toml', '.py', '.rst', '.rtf', '.md', '.html', '.htm', '.xml', '.json', '.yaml',
                     '.yml', '.ini', '.log', '.tex', '.sql', '.sh', '.bat', '.js', '.css', '.php', '.jsp', '.pl', '.r',
                     '.lua', '.conf', '.properties', '.tsv', '.xhtml', '.srt', '.vtt', '.cpp', '.c', '.h', '.go')
    doc_types = ('.pdf', '.docx', '.doc', '.epub', '.pptx', '.ppt', '.xls', '.xlsx')

    from openai_server.agent_tools.common.utils import filename_is_url

    # move any URL that was passed via --files into the URL list
    files_new = []
    urls_new = []
    for filename in files + urls:
        if filename in files:
            if filename_is_url(filename):
                urls_new.append(filename)
            else:
                files_new.append(filename)
        else:
            urls_new.append(filename)
    files = files_new
    urls = urls_new

    from openai_server.agent_tools.common.utils import download_simple

    for filename in files + urls:
        enable_transcriptions = False
        enable_llava = False
        if filename.lower().endswith('.pdf'):
            if filename in urls:
                newfile = download_simple(filename)
                num_pages = get_num_pages(newfile)
                has_images = pdf_has_images(newfile)
            else:
                num_pages = get_num_pages(filename)
                has_images = pdf_has_images(filename)
            if num_pages and num_pages < 20:
                # small PDF: docTR OCR if it has images, else pypdf text extraction
                if has_images:
                    enable_pdf_doctr = 'on'
                    use_pypdf = 'off'
                else:
                    enable_pdf_doctr = 'off'
                    use_pypdf = 'on'
                use_pymupdf = 'off'
            else:
                # large (or unreadable) PDF: use PyMuPDF only
                enable_pdf_doctr = 'off'
                use_pymupdf = 'on'
                use_pypdf = 'off'
        else:
            # non-pdf, allow docTR in case, e.g. video
            enable_pdf_doctr = 'on'
            use_pymupdf = 'on'
            use_pypdf = 'off'
            enable_transcriptions = True
            enable_llava = True
        if filename.lower().endswith('.xls') or filename.lower().endswith('.xlsx'):
            if filename in urls:
                xls_file = download_simple(filename)
            else:
                xls_file = filename
            convert_to_csv(xls_file)
        sources1, known_type = get_data_h2ogpt(filename,
                                               is_url=filename in urls,
                                               verbose=False,
                                               use_pymupdf=use_pymupdf,
                                               use_pypdf=use_pypdf,
                                               use_unstructured_pdf='off',
                                               enable_pdf_ocr='off',
                                               enable_pdf_doctr=enable_pdf_doctr,
                                               try_pdf_as_html='off',
                                               enable_captions=False,  # no need if llava used
                                               enable_llava=enable_llava,
                                               chunk=False,
                                               enable_transcriptions=enable_transcriptions,
                                               )
        all_content1, each_content1 = sources_to_text(sources1)
        if filename.lower().endswith('.pdf') and enable_pdf_doctr == 'off':
            # retry text-based PDFs with the other parser (pypdf <-> PyMuPDF)
            if use_pymupdf == 'on':
                use_pymupdf = 'off'
                use_pypdf = 'on'
            else:
                use_pymupdf = 'on'
                use_pypdf = 'off'
            sources2, known_type = get_data_h2ogpt(filename,
                                                   is_url=filename in urls,
                                                   verbose=False,
                                                   use_pymupdf=use_pymupdf,
                                                   use_pypdf=use_pypdf,
                                                   use_unstructured_pdf='off',
                                                   enable_pdf_ocr='off',
                                                   enable_pdf_doctr=enable_pdf_doctr,
                                                   try_pdf_as_html='off',
                                                   enable_captions=False,
                                                   enable_llava=False,
                                                   chunk=False,
                                                   enable_transcriptions=False,
                                                   )
            all_content2, each_content2 = sources_to_text(sources2)
            # choose one with more content in case pymupdf fails to find info
            if len(all_content2) > len(all_content1):
                each_content1 = each_content2
        if not sources1:
            succeeded.append(False)
            print(f"Unable to handle file type for {filename}")
        else:
            succeeded.append(True)
            text_context_list.extend(each_content1)
    return text_context_list, any(succeeded)


def get_text(files, urls):
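    """Return all extracted text joined by blank lines, or None if nothing succeeded."""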
    text_context_list, any_succeeded = process_files(files, urls)
    # Join the text_context_list into a single string
    if any_succeeded:
        output_text = "\n\n".join(text_context_list)
    else:
        output_text = None
    return output_text


def main():
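    """Parse command-line arguments, convert the inputs to text, and write/print the result."""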
    parser = argparse.ArgumentParser(description="Converts document to text")
    parser.add_argument("--files", nargs="+", required=False, help="Files to convert to text")
    parser.add_argument("--urls", nargs="+", required=False, help="URLs to convert to text")
    parser.add_argument("--output", type=str, required=False, help="Output filename")
    args = parser.parse_args()

    if not args.output:
        args.output = f"conversion_to_text_{str(uuid.uuid4())[:6]}.txt"

    files = args.files or []
    urls = args.urls or []
    output_text = get_text(files, urls)

    # Write the output to the specified file
    if output_text is not None:
        with open(args.output, "w") as f:
            f.write(output_text)
        print(f"{files + urls} have been converted to text and written to {args.output}")
        print("The output may be complex for input of PDFs or URLs etc., "
              "so do not assume the structure of the output file and instead check it directly.")
        print("You should probably verify any use of convert_document_to_text.py "
              "with ask_question_about_documents.py.")
        # only print a preview when the text exceeds roughly 1024 tokens (~4 chars per token)
        max_tokens = 1024
        max_chars = max_tokens * 4
        if len(output_text) > max_chars:
            print(f"Head of the text (MUST use file {args.output} for full text):")
            print(output_text[:max_chars])
        else:
            print(output_text)
    else:
        print("Failed to convert files or URLs to text")
    return output_text


if __name__ == "__main__":
    main()

"""
Examples:
wget https://aiindex.stanford.edu/wp-content/uploads/2024/04/HAI_2024_AI-Index-Report.pdf
python /home/jon/h2ogpt/openai_server/agent_tools/convert_document_to_text.py --urls http://www.cnn.com
python /home/jon/h2ogpt/openai_server/agent_tools/convert_document_to_text.py --files HAI_2024_AI-Index-Report.pdf
python /home/jon/h2ogpt/openai_server/agent_tools/convert_document_to_text.py --urls https://aiindex.stanford.edu/wp-content/uploads/2024/04/HAI_2024_AI-Index-Report.pdf
"""