Trying To Extract Embedded File Attachments From Existing PDF Using C# .NET And PDFBox 1.7.0

1.5k Views Asked by At

I am trying to extract embedded file attachments from an existing PDF using C# .NET and PDFBox. The following is my code:

using System.Collections.Generic;
using System.IO;
using java.util;                                            // IKVM Java for Microsoft .NET  http://www.ikvm.net  
using java.io;                                              // IKVM Java for Microsoft .NET  http://www.ikvm.net
using org.apache.pdfbox.pdmodel;                            // PDFBox 1.7.0 http://pdfbox.apache.org
using org.apache.pdfbox.pdmodel.common;                     // PDFBox 1.7.0 http://pdfbox.apache.org
using org.apache.pdfbox.pdmodel.common.filespecification;   // PDFBox 1.7.0 http://pdfbox.apache.org
using org.apache.pdfbox.cos;                                // PDFBox 1.7.0 http://pdfbox.apache.org

namespace PDFClass
{
    public class Class1
    {
        public Class1 ()
        {
        }

        public void ReadPDFAttachments (string existingFileNameFullPath)
        {
            PDEmbeddedFilesNameTreeNode efTree;
            PDComplexFileSpecification fs;
            FileStream stream;
            ByteArrayInputStream fakeFile;
            PDDocument pdfDocument = new PDDocument();
            PDEmbeddedFile ef;
            PDDocumentNameDictionary names;
            Map efMap = new HashMap();

            pdfDocument = PDDocument.load(existingFileNameFullPath);
            PDDocumentNameDictionary namesDictionary = new PDDocumentNameDictionary(pdfDocument.getDocumentCatalog());
            PDEmbeddedFilesNameTreeNode embeddedFiles = namesDictionary.getEmbeddedFiles(); // some bug is currently preventing this call from working! >:[

            if (embeddedFiles != null)
            {
                var aKids = embeddedFiles.getKids().toArray();
                List<PDNameTreeNode> kids = new List<PDNameTreeNode>();
                foreach (object oKid in aKids)
                {
                    kids.Add(oKid as PDNameTreeNode);
                }
                if (kids != null)
                {
                    foreach (PDNameTreeNode kid in kids)
                    {
                        PDComplexFileSpecification spec = (PDComplexFileSpecification)kid.getValue("ZUGFERD_XML_FILENAME");
                        PDEmbeddedFile file = spec.getEmbeddedFile();
                        fs = new PDComplexFileSpecification();

                        // Loop through each file for re-embedding
                        byte[] data = file.getByteArray();
                        int read = data.Length;
                        fakeFile = new ByteArrayInputStream(data);
                        ef = new PDEmbeddedFile(pdfDocument, fakeFile);
                        fs.setEmbeddedFile(ef);

                        efMap.put(kid.toString(), fs);
                        embeddedFiles.setNames(efMap);
                        names = new PDDocumentNameDictionary(pdfDocument.getDocumentCatalog());
                        ((COSDictionary)efTree.getCOSObject()).removeItem(COSName.LIMITS);  // Bug in PDFBox code requires we do this, or attachment will not embed. >:[
                        names.setEmbeddedFiles(embeddedFiles);
                        pdfDocument.getDocumentCatalog().setNames(names);
                        fs.getCOSDictionary().setString("Desc", kid.toString()); // adds a description to attachment in PDF attachment list
                    }
                }
            }
        }

    }
}

The variable embeddedFiles is always null. even though I put a break in the code and can see the PDF file clearly has the attachment in it.

Any assistance would be greatly appreciated!

0

There are 0 best solutions below