//---------------------------------------------------------------------------------------
// Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
// Consult legal.txt regarding legal and license information.
//---------------------------------------------------------------------------------------
// !Warning! This file is autogenerated, modify the .codegen file, not this one
// (any changes here will be wiped out during the autogen process)

#ifndef PDFTRON_H_CPPPDFDataExtractionModule
#define PDFTRON_H_CPPPDFDataExtractionModule

// NOTE(review): the targets of the #include directives below are missing in this
// copy of the file. The declarations that follow use UString, PDFDoc,
// DataExtractionOptions and Filters::Filter, none of which is declared here, so the
// original targets presumably supplied them -- restore them from the generated
// original before compiling.
#include
#include
#include
#include
#include

namespace pdftron { namespace PDF {

/**
 * The class DataExtractionModule.
 * Static interface to the Apryse SDK's data extraction functionality.
 * Every member function declared here is static; the class declares no data members.
 */
class DataExtractionModule
{
public:

	/**
	 * Data Extraction Engines.
	 * Selects which engine IsModuleAvailable() queries and which engine
	 * ExtractData() runs.
	 */
	enum DataExtractionEngine
	{
		/**
		 * Tabular Data engine. This engine identifies column and row structure and
		 * analyzes numeric columns. It is especially suited to documents that are
		 * table-based, such as spreadsheets.
		 */
		e_Tabular = 0,
		/**
		 * Form field extraction engine. This engine uses artificial intelligence and
		 * computer vision to detect form fields in documents that do not have any
		 * interactive field annotations embedded.
		 */
		e_Form = 1,
		/**
		 * Document structure engine. This engine discovers the full logical
		 * structure, including headers, footers, paragraphs, list items, table
		 * columns, cells, borders, images and graphics.
		 */
		e_DocStructure = 2,
		/**
		 * Form field with key value extraction engine. This engine uses artificial
		 * intelligence and computer vision to detect form fields, including field
		 * names and values, in documents that do not have any interactive field
		 * annotations embedded.
		 */
		e_FormKeyValue = 3,
		/**
		 * Generic key value extraction engine. This engine uses artificial
		 * intelligence to detect arbitrary key/value pairs in documents.
		 * Note: This engine is experimental and subject to change.
		 */
		e_GenericKeyValue = 4
	};

	/**
	 * Find out whether the specified data extraction engine is available
	 * (and licensed).
	 *
	 * @param engine The extraction engine to query.
	 * @return true if data extraction operations can be performed.
	 */
	static bool IsModuleAvailable(DataExtractionEngine engine);

	/**
	 * Perform data extraction on a PDF file using the specified engine and return
	 * the resulting JSON as a string.
	 * Note: The FormKeyValue engine is experimental and subject to change.
	 *
	 * @param input_pdf_file -- The source document filename.
	 * @param engine -- The extraction engine.
	 * @param options -- Data extraction options (optional). Defaults to 0, i.e. a null pointer.
	 * @return JSON string representing the extracted results.
	 */
	static UString ExtractData(const UString& input_pdf_file, DataExtractionEngine engine, DataExtractionOptions* options = 0);

	/**
	 * Perform data extraction on a PDF file using the specified engine and write
	 * the result to a JSON file. This overload returns nothing; the result is
	 * delivered through output_json_file.
	 * Note: The FormKeyValue engine is experimental and subject to change.
	 *
	 * @param input_pdf_file -- The source document filename.
	 * @param output_json_file -- The resulting JSON filename.
	 * @param engine -- The extraction engine.
	 * @param options -- Data extraction options (optional). Defaults to 0, i.e. a null pointer.
	 */
	static void ExtractData(const UString& input_pdf_file, const UString& output_json_file, DataExtractionEngine engine, DataExtractionOptions* options = 0);

	/**
	 * Perform automatic form field detection, then insert the fields into the PDF.
	 * Unlike ExtractData(), this function takes no engine argument.
	 * Note: The FormKeyValue engine is experimental and subject to change.
	 *
	 * @param doc -- The PDF document where fields are detected from and inserted into.
	 * @param options -- Data extraction options (optional). Defaults to 0, i.e. a null pointer.
	 */
	static void DetectAndAddFormFieldsToPDF(PDFDoc& doc, DataExtractionOptions* options = 0);

	/**
	 * Perform data extraction on a PDF in XLSX output format, writing the result
	 * to a file.
	 *
	 * @param input_pdf_file -- The source document filename.
	 * @param output_xlsx_file -- The resulting XLSX filename.
	 * @param options -- Data extraction options (optional). Defaults to 0, i.e. a null pointer.
	 */
	static void ExtractToXLSX(const UString& input_pdf_file, const UString& output_xlsx_file, DataExtractionOptions* options = 0);

	/**
	 * Perform data extraction on a PDF in XLSX output format, writing the result
	 * to a filter instead of a named file.
	 *
	 * @param input_pdf_file -- The source document filename.
	 * @param output_xlsx_stream -- The resulting XLSX filter.
	 * @param options -- Data extraction options (optional). Defaults to 0, i.e. a null pointer.
	 */
	static void ExtractToXLSX(const UString& input_pdf_file, Filters::Filter& output_xlsx_stream, DataExtractionOptions* options = 0);

};

// NOTE(review): the target of this #include is also missing in this copy. Because it
// sits inside namespace pdftron::PDF after the class, it presumably pulls in the
// definitions of the static members declared above -- confirm against the generated
// original.
#include

} //end PDF
} //end pdftron

#endif //PDFTRON_H_CPPPDFDataExtractionModule