/* Copyright (C) 2014 konik.io
 *
 * This file is part of the Konik library.
 *
 * The Konik library is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * The Konik library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with the Konik library. If not, see <http://www.gnu.org/licenses/>.
 */
package io.konik.carriage.pdfbox;

import io.konik.carriage.utils.CallBackInputStream;
import io.konik.harness.FileExtractor;
import io.konik.harness.exception.InvoiceExtractionError;

import java.io.IOException;
import java.io.InputStream;

import javax.inject.Named;
import javax.inject.Singleton;

import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;

/**
 * The PDFBox invoice extractor.
 *
 * Looks up the embedded file named {@value #ZF_FILE_NAME} in a PDF document and
 * returns its content, using the PDFBox document model.
 */
@Named
@Singleton
public class PDFBoxInvoiceExtractor implements FileExtractor {

   static final String NO_EMBEDDED_FILES = "The PDF does not contain any attached files";
   static final String NO_ZF_FILE = "The PDF does not contain an attached file named ZUGFeRD-invoice.xml";
   static final String ZF_FILE_NAME = "ZUGFeRD-invoice.xml";

   /**
    * Extracts the invoice attachment and reads it fully into memory.
    *
    * @param pdfInput the PDF content
    * @return the bytes of the embedded invoice file
    * @throws InvoiceExtractionError if the attachment is missing or reading fails
    */
   @Override
   public byte[] extract(InputStream pdfInput) {
      InputStream attachmentFile = null;
      try {
         attachmentFile = extractToStream(pdfInput);
         return IOUtils.toByteArray(attachmentFile);
      } catch (IOException e) {
         throw new InvoiceExtractionError("Error extracting content from PDF", e);
      } finally {
         // attachmentFile stays null when extractToStream itself failed
         IOUtils.closeQuietly(attachmentFile);
      }
   }

   /**
    * Extracts the invoice attachment as a stream.
    *
    * The caller is responsible for closing the returned stream.
    * NOTE(review): the stream is a {@link CallBackInputStream} constructed with the
    * loaded document; presumably closing it also closes the document - confirm
    * against CallBackInputStream.
    *
    * @param pdfInput the PDF content
    * @return a stream over the embedded invoice file
    * @throws InvoiceExtractionError if the attachment is missing or reading fails
    */
   @Override
   public InputStream extractToStream(InputStream pdfInput) {
      try {
         return extractIntern(pdfInput);
      } catch (IOException e) {
         throw new InvoiceExtractionError("Error extracting content from PDF", e);
      }
   }

   /**
    * Loads the PDF and wraps the attachment stream together with the document.
    *
    * The document is closed here on every failure path; it is left open only
    * once it has been handed over to the returned stream.
    */
   private static InputStream extractIntern(InputStream pdfStream) throws IOException {
      PDDocument doc = PDDocument.load(pdfStream);
      boolean handedOver = false;
      try {
         InputStream attachment = extractZugferdFileAttachment(doc);
         InputStream result = new CallBackInputStream(attachment, doc);
         handedOver = true;
         return result;
      } finally {
         if (!handedOver) {
            // Lookup failed (missing attachment or I/O error): do not leak the document.
            closeSilently(doc);
         }
      }
   }

   /** Closes the document, ignoring close failures so the original error is the one reported. */
   private static void closeSilently(PDDocument doc) {
      try {
         doc.close();
      } catch (IOException ignored) {
         // Intentionally ignored: an earlier failure is already propagating.
      }
   }

   /** Resolves the embedded-files name tree of the document and opens the invoice attachment. */
   private static InputStream extractZugferdFileAttachment(PDDocument doc) throws IOException {
      PDDocumentNameDictionary nameDictionary = new PDDocumentNameDictionary(doc.getDocumentCatalog());
      PDEmbeddedFilesNameTreeNode embeddedFiles = listEmbeddedFiles(nameDictionary);
      return extractZugferdXmlAttachment(embeddedFiles);
   }

   /**
    * Returns the embedded-files name tree.
    *
    * @throws InvoiceExtractionError if the PDF has no embedded files entry
    */
   private static PDEmbeddedFilesNameTreeNode listEmbeddedFiles(PDDocumentNameDictionary names) {
      PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
      if (embeddedFiles == null) {
         throw new InvoiceExtractionError(NO_EMBEDDED_FILES);
      }
      return embeddedFiles;
   }

   /**
    * Opens a stream on the embedded file registered under {@link #ZF_FILE_NAME}.
    *
    * @throws InvoiceExtractionError if no such entry exists or the entry carries no embedded file
    */
   private static InputStream extractZugferdXmlAttachment(PDEmbeddedFilesNameTreeNode embeddedFiles)
         throws IOException {
      PDComplexFileSpecification fileSpec = embeddedFiles.getValue(ZF_FILE_NAME);
      if (fileSpec == null) {
         throw new InvoiceExtractionError(NO_ZF_FILE);
      }
      // A file specification may exist without an embedded stream; report it
      // as an extraction error instead of failing with a NullPointerException.
      if (fileSpec.getEmbeddedFile() == null) {
         throw new InvoiceExtractionError(NO_ZF_FILE);
      }
      return fileSpec.getEmbeddedFile().createInputStream();
   }

}