001/* Copyright (C) 2014 konik.io
002 *
003 * This file is part of the Konik library.
004 *
005 * The Konik library is free software: you can redistribute it and/or modify
006 * it under the terms of the GNU Affero General Public License as
007 * published by the Free Software Foundation, either version 3 of the
008 * License, or (at your option) any later version.
009 *
010 * The Konik library is distributed in the hope that it will be useful,
011 * but WITHOUT ANY WARRANTY; without even the implied warranty of
012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013 * GNU Affero General Public License for more details.
014 *
015 * You should have received a copy of the GNU Affero General Public License
016 * along with the Konik library. If not, see <http://www.gnu.org/licenses/>.
017 */
018package io.konik.carriage.pdfbox;
019
020import io.konik.carriage.utils.CallBackInputStream;
021import io.konik.harness.FileExtractor;
022import io.konik.harness.exception.InvoiceExtractionError;
023
024import java.io.IOException;
025import java.io.InputStream;
026
027import javax.inject.Named;
028import javax.inject.Singleton;
029
030import org.apache.pdfbox.io.IOUtils;
031import org.apache.pdfbox.pdmodel.PDDocument;
032import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
033import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
034import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
035
036/**
037 * The PDFBoxInvoice Extractor.
038 */
039@Named
040@Singleton
041public class PDFBoxInvoiceExtractor implements FileExtractor {
042
043   
044   static final String NO_EMBEDDED_FILES = "The PDF does not contain any attached files";
045   static final String NO_ZF_FILE = "The PDF does not contain an attached file named ZUGFeRD-invoice.xml";
046   static final String ZF_FILE_NAME = "ZUGFeRD-invoice.xml";
047
048   @Override
049   public byte[] extract(InputStream pdfInput) {
050      InputStream attachmentFile = null;
051      try {
052         attachmentFile = extractToStream(pdfInput);
053         return IOUtils.toByteArray(attachmentFile);
054      } catch (IOException e) {
055         throw new InvoiceExtractionError("Error extracting content from PDF",e);
056      }finally {
057         IOUtils.closeQuietly(attachmentFile);
058      }
059   }
060   
061   @Override
062   public InputStream extractToStream(InputStream pdfInput) {
063      try {
064         return extractIntern(pdfInput);
065      } catch (IOException e) {
066         throw new InvoiceExtractionError("Error extracting content from PDF",e);
067      }
068   }
069   
070   private static final InputStream extractIntern(InputStream pdfStream) throws IOException {
071      PDDocument doc = PDDocument.load(pdfStream);
072      InputStream inputStream = extractZugferdFileAttachment(doc);
073      return new CallBackInputStream(inputStream, doc);
074   }
075   
076   private static final InputStream extractZugferdFileAttachment(PDDocument doc) throws IOException {
077      PDDocumentNameDictionary nameDictionary = new PDDocumentNameDictionary(doc.getDocumentCatalog());
078      PDEmbeddedFilesNameTreeNode embeddedFiles = listEmbeddedFiles(nameDictionary);
079      return extractZugferdXmlAttachment(embeddedFiles);
080   }
081
082   private static final PDEmbeddedFilesNameTreeNode listEmbeddedFiles(PDDocumentNameDictionary names) {
083      PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
084      if (embeddedFiles == null) { throw new InvoiceExtractionError(NO_EMBEDDED_FILES); }
085      return embeddedFiles;
086   }
087   
088   private static final InputStream extractZugferdXmlAttachment(PDEmbeddedFilesNameTreeNode embeddedFiles)
089         throws IOException {
090      PDComplexFileSpecification fileSpec = embeddedFiles.getValue(ZF_FILE_NAME);
091      if (fileSpec == null) { throw new InvoiceExtractionError(NO_ZF_FILE); }
092      return fileSpec.getEmbeddedFile().createInputStream();
093   }
094
095
096}