//--------------------------------------------------------------------------------------- // Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved. // Consult legal.txt regarding legal and license information. //--------------------------------------------------------------------------------------- #include #include #include #include #include #include "../../LicenseKey/CPP/LicenseKey.h" using namespace pdftron; using namespace PDF; using namespace std; //--------------------------------------------------------------------------------------- // This sample explores the structure and content of a tagged PDF document and dumps // the structure information to the console window. // // In tagged PDF documents StructTree acts as a central repository for information // related to a PDF document's logical structure. The tree consists of StructElement-s // and ContentItem-s which are leaf nodes of the structure tree. // // The sample can be extended to access and extract the marked-content elements such // as text and images. //--------------------------------------------------------------------------------------- void PrintIndent(int indent) { cout << '\n'; for (int i=0; i MCIDPageMap; typedef map MCIDDocMap; // Used in code snippet 3. void ProcessElements2(ElementReader& reader, MCIDPageMap& mcid_page_map) { Element element; while (element = reader.Next()) // Read page contents { // In this sample we process only text, but the code can be extended // to handle paths, images, or any other Element type. int mcid = element.GetStructMCID(); if (mcid>= 0 && element.GetType() == Element::e_text) { string val = element.GetTextString().ConvertToAscii(); MCIDPageMap::iterator itr = mcid_page_map.find(mcid); if (itr != mcid_page_map.end()) itr->second += val; else mcid_page_map.insert(MCIDPageMap::value_type(mcid, val)); } } } // Used in code snippet 3. 
// Prints `element` and its subtree as an indented, tag-style outline: an opening
// <Type title="..."> line, then for each kid either the text recorded for a content
// item's MCID or the nested outline of a child structure element, then a closing
// </Type> line. Invalid elements print nothing.
//
// element      - structure element to print.
// mcid_doc_map - page index -> (MCID -> text); looked up, never modified here.
// ident        - nesting depth handed to PrintIndent; children are printed at ident+1.
//
// NOTE(review): this copy of the file had every `<...>` span stripped, which removed
// the loop header, the content-item lookup up to `itr->second`, and the closing-tag
// output. Those parts are restored here from the surviving tokens (`cont`,
// `mcid_page_map`, `itr2`, the brace nesting) and the public PDFNet API - confirm
// against the original sample.
void ProcessStructElement2(Struct::SElement element, MCIDDocMap& mcid_doc_map, int ident)
{
    if (!element.IsValid()) {
        return;
    }

    // Print out the type and title info, if any.
    PrintIndent(ident);
    cout << "<" << element.GetType();
    if (element.HasTitle()) {
        cout << " title=\""<< element.GetTitle() << "\"";
    }
    cout << ">";

    int num = element.GetNumKids();
    for (int i=0; i<num; ++i)
    {
        if (element.IsContentItem(i)) {
            // Leaf node: print the text collected for this content item's MCID on
            // the page it belongs to, if any was recorded.
            Struct::ContentItem cont = element.GetAsContentItem(i);
            if (cont.GetType() == Struct::ContentItem::e_MCID) {
                int page_num = cont.GetPage().GetIndex();
                MCIDDocMap::iterator itr = mcid_doc_map.find(page_num);
                if (itr != mcid_doc_map.end()) {
                    MCIDPageMap& mcid_page_map = itr->second;
                    MCIDPageMap::iterator itr2 = mcid_page_map.find(cont.GetMCID());
                    if (itr2 != mcid_page_map.end()) {
                        cout << itr2->second;
                    }
                }
            }
        }
        else {  // the kid is another StructElement node.
            ProcessStructElement2(element.GetAsStructElem(i), mcid_doc_map, ident+1);
        }
    }

    PrintIndent(ident);
    cout << "</" << element.GetType() << ">";
}

// NOTE(review): main() is damaged in this copy (spans stripped mid-body) and runs past
// the end of the visible text, so it is kept verbatim below rather than reconstructed.
// int main(int argc, char *argv[]) { int ret = 0; PDFNet::Initialize(LicenseKey); // Relative path to the folder containing test files. string input_path = "../../TestFiles/"; string output_path = "../../TestFiles/Output/"; try // Extract logical structure from a PDF document { PDFDoc doc((input_path + "tagged.pdf").c_str()); doc.InitSecurityHandler(); cout << "____________________________________________________________" << endl; cout << "Sample 1 - Traverse logical structure tree..." << endl; { Struct::STree tree = doc.GetStructTree(); if (tree.IsValid()) { cout << "Document has a StructTree root." << endl; for (int i=0; i r = mcid_doc_map.insert(MCIDDocMap::value_type(itr.Current().GetIndex(), MCIDPageMap())); MCIDPageMap& page_mcid_map = (r.first)->second; ProcessElements2(reader, page_mcid_map); reader.End(); } Struct::STree tree = doc.GetStructTree(); if (tree.IsValid()) { for (int i=0; i