Spaces:
Sleeping
Sleeping
//======================================================================== | |
// | |
// pdftohtml.cc | |
// | |
// | |
// Copyright 1999-2000 G. Ovtcharov | |
//======================================================================== | |
//======================================================================== | |
// | |
// Modified under the Poppler project - http://poppler.freedesktop.org | |
// | |
// All changes made under the Poppler project to this file are licensed | |
// under GPL version 2 or later | |
// | |
// Copyright (C) 2007-2008, 2010, 2012, 2015-2020, 2022 Albert Astals Cid <[email protected]> | |
// Copyright (C) 2010 Hib Eris <[email protected]> | |
// Copyright (C) 2010 Mike Slegeir <[email protected]> | |
// Copyright (C) 2010, 2013 Suzuki Toshiya <[email protected]> | |
// Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey ([email protected]) and Onkar Potdar ([email protected]) | |
// Copyright (C) 2011 Steven Murdoch <[email protected]> | |
// Copyright (C) 2012 Igor Slepchin <[email protected]> | |
// Copyright (C) 2012 Ihar Filipau <[email protected]> | |
// Copyright (C) 2012 Luis Parravicini <[email protected]> | |
// Copyright (C) 2014 Pino Toscano <[email protected]> | |
// Copyright (C) 2015 William Bader <[email protected]> | |
// Copyright (C) 2017, 2021 Adrian Johnson <[email protected]> | |
// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <[email protected]>. Work sponsored by the LiMux project of the city of Munich | |
// Copyright (C) 2018 Thibaut Brard <[email protected]> | |
// Copyright (C) 2018 Adam Reichold <[email protected]> | |
// Copyright (C) 2019, 2021, 2024 Oliver Sander <[email protected]> | |
// Copyright (C) 2021 Hubert Figuiere <[email protected]> | |
// Copyright (C) 2024 g10 Code GmbH, Author: Sune Stolborg Vuorela <[email protected]> | |
// | |
// To see a description of the changes please see the Changelog file that | |
// came with your tarball or type make ChangeLog if you are building from git | |
// | |
//======================================================================== | |
static int firstPage = 1; | |
static int lastPage = 0; | |
static bool rawOrder = true; | |
bool printCommands = true; | |
static bool printHelp = false; | |
bool printHtml = false; | |
bool complexMode = false; | |
bool singleHtml = false; // singleHtml | |
bool dataUrls = false; | |
bool ignore = false; | |
static char extension[5] = "png"; | |
static double scale = 1.5; | |
bool noframes = false; | |
bool stout = false; | |
bool xml = false; | |
bool noRoundedCoordinates = false; | |
static bool errQuiet = false; | |
static bool noDrm = false; | |
double wordBreakThreshold = 10; // 10%, below converted into a coefficient - 0.1 | |
bool showHidden = false; | |
bool noMerge = false; | |
bool fontFullName = false; | |
static char ownerPassword[33] = ""; | |
static char userPassword[33] = ""; | |
static bool printVersion = false; | |
static std::unique_ptr<GooString> getInfoString(Dict *infoDict, const char *key); | |
static GooString *getInfoDate(Dict *infoDict, const char *key); | |
static char textEncName[128] = ""; | |
static const ArgDesc argDesc[] = { { "-f", argInt, &firstPage, 0, "first page to convert" }, | |
{ "-l", argInt, &lastPage, 0, "last page to convert" }, | |
/*{"-raw", argFlag, &rawOrder, 0, | |
"keep strings in content stream order"},*/ | |
{ "-q", argFlag, &errQuiet, 0, "don't print any messages or errors" }, | |
{ "-h", argFlag, &printHelp, 0, "print usage information" }, | |
{ "-?", argFlag, &printHelp, 0, "print usage information" }, | |
{ "-help", argFlag, &printHelp, 0, "print usage information" }, | |
{ "--help", argFlag, &printHelp, 0, "print usage information" }, | |
{ "-p", argFlag, &printHtml, 0, "exchange .pdf links by .html" }, | |
{ "-c", argFlag, &complexMode, 0, "generate complex document" }, | |
{ "-s", argFlag, &singleHtml, 0, "generate single document that includes all pages" }, | |
{ "-dataurls", argFlag, &dataUrls, 0, "use data URLs instead of external images in HTML" }, | |
{ "-i", argFlag, &ignore, 0, "ignore images" }, | |
{ "-noframes", argFlag, &noframes, 0, "generate no frames" }, | |
{ "-stdout", argFlag, &stout, 0, "use standard output" }, | |
{ "-zoom", argFP, &scale, 0, "zoom the pdf document (default 1.5)" }, | |
{ "-xml", argFlag, &xml, 0, "output for XML post-processing" }, | |
{ "-noroundcoord", argFlag, &noRoundedCoordinates, 0, "do not round coordinates (with XML output only)" }, | |
{ "-hidden", argFlag, &showHidden, 0, "output hidden text" }, | |
{ "-nomerge", argFlag, &noMerge, 0, "do not merge paragraphs" }, | |
{ "-enc", argString, textEncName, sizeof(textEncName), "output text encoding name" }, | |
{ "-fmt", argString, extension, sizeof(extension), "image file format for Splash output (png or jpg)" }, | |
{ "-v", argFlag, &printVersion, 0, "print copyright and version info" }, | |
{ "-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for encrypted files)" }, | |
{ "-upw", argString, userPassword, sizeof(userPassword), "user password (for encrypted files)" }, | |
{ "-nodrm", argFlag, &noDrm, 0, "override document DRM settings" }, | |
{ "-wbt", argFP, &wordBreakThreshold, 0, "word break threshold (default 10 percent)" }, | |
{ "-fontfullname", argFlag, &fontFullName, 0, "outputs font full name" }, | |
{} }; | |
class SplashOutputDevNoText : public SplashOutputDev | |
{ | |
public: | |
SplashOutputDevNoText(SplashColorMode colorModeA, int bitmapRowPadA, bool reverseVideoA, SplashColorPtr paperColorA, bool bitmapTopDownA = true) | |
: SplashOutputDev(colorModeA, bitmapRowPadA, reverseVideoA, paperColorA, bitmapTopDownA) { } | |
~SplashOutputDevNoText() override; | |
void drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, const Unicode *u, int uLen) override { } | |
bool beginType3Char(GfxState *state, double x, double y, double dx, double dy, CharCode code, const Unicode *u, int uLen) override { return false; } | |
void endType3Char(GfxState *state) override { } | |
void beginTextObject(GfxState *state) override { } | |
void endTextObject(GfxState *state) override { } | |
bool interpretType3Chars() override { return false; } | |
}; | |
SplashOutputDevNoText::~SplashOutputDevNoText() = default; | |
int main(int argc, char *argv[]) | |
{ | |
std::unique_ptr<PDFDoc> doc; | |
GooString *fileName = nullptr; | |
std::unique_ptr<GooString> docTitle; | |
std::unique_ptr<GooString> author; | |
std::unique_ptr<GooString> keywords; | |
std::unique_ptr<GooString> subject; | |
GooString *date = nullptr; | |
GooString *htmlFileName = nullptr; | |
HtmlOutputDev *htmlOut = nullptr; | |
SplashOutputDev *splashOut = nullptr; | |
bool doOutline; | |
bool ok; | |
std::optional<GooString> ownerPW, userPW; | |
Object info; | |
int exit_status = EXIT_FAILURE; | |
Win32Console win32Console(&argc, &argv); | |
// parse args | |
ok = parseArgs(argDesc, &argc, argv); | |
if (!ok || argc < 2 || argc > 3 || printHelp || printVersion) { | |
fprintf(stderr, "pdftohtml version %s\n", PACKAGE_VERSION); | |
fprintf(stderr, "%s\n", popplerCopyright); | |
fprintf(stderr, "%s\n", "Copyright 1999-2003 Gueorgui Ovtcharov and Rainer Dorsch"); | |
fprintf(stderr, "%s\n\n", xpdfCopyright); | |
if (!printVersion) { | |
printUsage("pdftohtml", "<PDF-file> [<html-file> <xml-file>]", argDesc); | |
} | |
exit(printHelp || printVersion ? 0 : 1); | |
} | |
// init error file | |
// errorInit(); | |
// read config file | |
globalParams = std::make_unique<GlobalParams>(); | |
if (errQuiet) { | |
globalParams->setErrQuiet(errQuiet); | |
printCommands = false; // I'm not 100% what is the difference between them | |
} | |
if (textEncName[0]) { | |
globalParams->setTextEncoding(textEncName); | |
if (!globalParams->getTextEncoding()) { | |
goto error; | |
} | |
} | |
// convert from user-friendly percents into a coefficient | |
wordBreakThreshold /= 100.0; | |
// open PDF file | |
if (ownerPassword[0]) { | |
ownerPW = GooString(ownerPassword); | |
} | |
if (userPassword[0]) { | |
userPW = GooString(userPassword); | |
} | |
fileName = new GooString(argv[1]); | |
if (fileName->cmp("-") == 0) { | |
delete fileName; | |
fileName = new GooString("fd://0"); | |
} | |
doc = PDFDocFactory().createPDFDoc(*fileName, ownerPW, userPW); | |
if (!doc->isOk()) { | |
goto error; | |
} | |
// check for copy permission | |
if (!doc->okToCopy()) { | |
if (!noDrm) { | |
error(errNotAllowed, -1, "Copying of text from this document is not allowed."); | |
goto error; | |
} | |
fprintf(stderr, "Document has copy-protection bit set.\n"); | |
} | |
// construct text file name | |
if (argc == 3) { | |
GooString *tmp = new GooString(argv[2]); | |
if (!xml) { | |
if (tmp->getLength() >= 5) { | |
const char *p = tmp->c_str() + tmp->getLength() - 5; | |
if (!strcmp(p, ".html") || !strcmp(p, ".HTML")) { | |
htmlFileName = new GooString(tmp->c_str(), tmp->getLength() - 5); | |
} | |
} | |
} else { | |
if (tmp->getLength() >= 4) { | |
const char *p = tmp->c_str() + tmp->getLength() - 4; | |
if (!strcmp(p, ".xml") || !strcmp(p, ".XML")) { | |
htmlFileName = new GooString(tmp->c_str(), tmp->getLength() - 4); | |
} | |
} | |
} | |
if (!htmlFileName) { | |
htmlFileName = new GooString(tmp); | |
} | |
delete tmp; | |
} else if (fileName->cmp("fd://0") == 0) { | |
error(errCommandLine, -1, "You have to provide an output filename when reading from stdin."); | |
goto error; | |
} else { | |
const char *p = fileName->c_str() + fileName->getLength() - 4; | |
if (!strcmp(p, ".pdf") || !strcmp(p, ".PDF")) { | |
htmlFileName = new GooString(fileName->c_str(), fileName->getLength() - 4); | |
} else { | |
htmlFileName = fileName->copy(); | |
} | |
// htmlFileName->append(".html"); | |
} | |
if (scale > 3.0) { | |
scale = 3.0; | |
} | |
if (scale < 0.5) { | |
scale = 0.5; | |
} | |
if (complexMode) { | |
// noframes=false; | |
stout = false; | |
} | |
if (stout) { | |
noframes = true; | |
complexMode = false; | |
} | |
if (xml) { | |
complexMode = true; | |
singleHtml = false; | |
noframes = true; | |
noMerge = true; | |
} | |
// get page range | |
if (firstPage < 1) { | |
firstPage = 1; | |
} | |
if (lastPage < 1 || lastPage > doc->getNumPages()) { | |
lastPage = doc->getNumPages(); | |
} | |
if (lastPage < firstPage) { | |
error(errCommandLine, -1, "Wrong page range given: the first page ({0:d}) can not be after the last page ({1:d}).", firstPage, lastPage); | |
goto error; | |
} | |
info = doc->getDocInfo(); | |
if (info.isDict()) { | |
docTitle = getInfoString(info.getDict(), "Title"); | |
author = getInfoString(info.getDict(), "Author"); | |
keywords = getInfoString(info.getDict(), "Keywords"); | |
subject = getInfoString(info.getDict(), "Subject"); | |
date = getInfoDate(info.getDict(), "ModDate"); | |
if (!date) { | |
date = getInfoDate(info.getDict(), "CreationDate"); | |
} | |
} | |
if (!docTitle) { | |
docTitle = std::make_unique<GooString>(htmlFileName); | |
} | |
if (!singleHtml) { | |
rawOrder = complexMode; // todo: figure out what exactly rawOrder do :) | |
} else { | |
rawOrder = singleHtml; | |
} | |
doOutline = doc->getOutline()->getItems() != nullptr; | |
// write text file | |
htmlOut = new HtmlOutputDev(doc->getCatalog(), htmlFileName->c_str(), docTitle->c_str(), author ? author->c_str() : nullptr, keywords ? keywords->c_str() : nullptr, subject ? subject->c_str() : nullptr, date ? date->c_str() : nullptr, | |
rawOrder, firstPage, doOutline); | |
if (date) { | |
delete date; | |
} | |
if ((complexMode || singleHtml) && !xml && !ignore) { | |
// White paper color | |
SplashColor color; | |
color[0] = color[1] = color[2] = 255; | |
// If the user specified "jpg" use JPEG, otherwise PNG | |
SplashImageFileFormat format = strcmp(extension, "jpg") ? splashFormatPng : splashFormatJpeg; | |
splashOut = new SplashOutputDevNoText(splashModeRGB8, 4, false, color); | |
splashOut->startDoc(doc.get()); | |
for (int pg = firstPage; pg <= lastPage; ++pg) { | |
InMemoryFile imf; | |
doc->displayPage(splashOut, pg, 72 * scale, 72 * scale, 0, true, false, false); | |
SplashBitmap *bitmap = splashOut->getBitmap(); | |
const std::unique_ptr<GooString> imgFileName = GooString::format("{0:s}{1:03d}.{2:s}", htmlFileName->c_str(), pg, extension); | |
auto f1 = dataUrls ? imf.open("wb") : fopen(imgFileName->c_str(), "wb"); | |
if (!f1) { | |
fprintf(stderr, "Could not open %s\n", imgFileName->c_str()); | |
continue; | |
} | |
bitmap->writeImgFile(format, f1, 72 * scale, 72 * scale); | |
fclose(f1); | |
if (dataUrls) { | |
htmlOut->addBackgroundImage(std::string((format == splashFormatJpeg) ? "data:image/jpeg;base64," : "data:image/png;base64,") + gbase64Encode(imf.getBuffer())); | |
} else { | |
htmlOut->addBackgroundImage(gbasename(imgFileName->c_str())); | |
} | |
} | |
delete splashOut; | |
} | |
if (htmlOut->isOk()) { | |
doc->displayPages(htmlOut, firstPage, lastPage, 72 * scale, 72 * scale, 0, true, false, false); | |
htmlOut->dumpDocOutline(doc.get()); | |
} | |
delete htmlOut; | |
exit_status = EXIT_SUCCESS; | |
// clean up | |
error: | |
delete fileName; | |
if (htmlFileName) { | |
delete htmlFileName; | |
} | |
return exit_status; | |
} | |
static std::unique_ptr<GooString> getInfoString(Dict *infoDict, const char *key) | |
{ | |
Object obj; | |
// Raw value as read from PDF (may be in pdfDocEncoding or UCS2) | |
const GooString *rawString; | |
// Value converted to unicode | |
Unicode *unicodeString; | |
int unicodeLength; | |
// Value HTML escaped and converted to desired encoding | |
std::unique_ptr<GooString> encodedString; | |
// Is rawString UCS2 (as opposed to pdfDocEncoding) | |
bool isUnicode; | |
obj = infoDict->lookup(key); | |
if (obj.isString()) { | |
rawString = obj.getString(); | |
// Convert rawString to unicode | |
if (hasUnicodeByteOrderMark(rawString->toStr())) { | |
isUnicode = true; | |
unicodeLength = (obj.getString()->getLength() - 2) / 2; | |
} else { | |
isUnicode = false; | |
unicodeLength = obj.getString()->getLength(); | |
} | |
unicodeString = new Unicode[unicodeLength]; | |
for (int i = 0; i < unicodeLength; i++) { | |
if (isUnicode) { | |
unicodeString[i] = ((rawString->getChar((i + 1) * 2) & 0xff) << 8) | (rawString->getChar(((i + 1) * 2) + 1) & 0xff); | |
} else { | |
unicodeString[i] = pdfDocEncoding[rawString->getChar(i) & 0xff]; | |
} | |
} | |
// HTML escape and encode unicode | |
encodedString = HtmlFont::HtmlFilter(unicodeString, unicodeLength); | |
delete[] unicodeString; | |
} | |
return encodedString; | |
} | |
static GooString *getInfoDate(Dict *infoDict, const char *key) | |
{ | |
Object obj; | |
int year, mon, day, hour, min, sec, tz_hour, tz_minute; | |
char tz; | |
struct tm tmStruct; | |
GooString *result = nullptr; | |
char buf[256]; | |
obj = infoDict->lookup(key); | |
if (obj.isString()) { | |
const GooString *s = obj.getString(); | |
// TODO do something with the timezone info | |
if (parseDateString(s, &year, &mon, &day, &hour, &min, &sec, &tz, &tz_hour, &tz_minute)) { | |
tmStruct.tm_year = year - 1900; | |
tmStruct.tm_mon = mon - 1; | |
tmStruct.tm_mday = day; | |
tmStruct.tm_hour = hour; | |
tmStruct.tm_min = min; | |
tmStruct.tm_sec = sec; | |
tmStruct.tm_wday = -1; | |
tmStruct.tm_yday = -1; | |
tmStruct.tm_isdst = -1; | |
mktime(&tmStruct); // compute the tm_wday and tm_yday fields | |
if (strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S+00:00", &tmStruct)) { | |
result = new GooString(buf); | |
} else { | |
result = new GooString(s); | |
} | |
} else { | |
result = new GooString(s); | |
} | |
} | |
return result; | |
} | |