001/* Copyright (C) 2014 konik.io 002 * 003 * This file is part of the Konik library. 004 * 005 * The Konik library is free software: you can redistribute it and/or modify 006 * it under the terms of the GNU Affero General Public License as 007 * published by the Free Software Foundation, either version 3 of the 008 * License, or (at your option) any later version. 009 * 010 * The Konik library is distributed in the hope that it will be useful, 011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 013 * GNU Affero General Public License for more details. 014 * 015 * You should have received a copy of the GNU Affero General Public License 016 * along with the Konik library. If not, see <http://www.gnu.org/licenses/>. 017 */ 018package io.konik.carriage.itext; 019 020import static com.itextpdf.text.pdf.PdfName.EF; 021import static com.itextpdf.text.pdf.PdfName.F; 022import static com.itextpdf.text.pdf.PdfReader.getStreamBytes; 023import io.konik.harness.FileExtractor; 024import io.konik.harness.exception.InvoiceExtractionError; 025 026import java.io.ByteArrayInputStream; 027import java.io.IOException; 028import java.io.InputStream; 029 030import javax.inject.Named; 031import javax.inject.Singleton; 032 033import com.itextpdf.text.pdf.PRStream; 034import com.itextpdf.text.pdf.PdfArray; 035import com.itextpdf.text.pdf.PdfDictionary; 036import com.itextpdf.text.pdf.PdfName; 037import com.itextpdf.text.pdf.PdfReader; 038import com.itextpdf.text.pdf.PdfStream; 039 040/** 041 * The Class iText Pdf Invoice Extractor. 042 */ 043@Named 044@Singleton 045public class ITextInvoiceExtractor implements FileExtractor { 046 047 private static final PdfName AF = new PdfName("AF"); 048 049 @Override 050 public byte[] extract(InputStream pdfInput) { 051 052 PdfReader reader = getPdfReader(pdfInput); 053 PdfArray af = getValidAf(reader.getCatalog()); 054 PdfDictionary fileSpec = getValidFileSpec(af); 055 PdfDictionary ef = getValidEf(fileSpec); 056 return getFStream(ef); 057 } 058 059 @Override 060 public InputStream extractToStream(InputStream pdfInput) { 061 return new ByteArrayInputStream(extract(pdfInput)); 062 } 063 064 065 private static PdfReader getPdfReader(InputStream pdfStream) { 066 try { 067 return new PdfReader(pdfStream); 068 } catch (IOException e) { 069 throw new InvoiceExtractionError("Could not read or open pdf.",e); 070 } 071 } 072 073 private static PdfArray getValidAf(PdfDictionary catalog) { 074 if (catalog.contains(AF)) { 075 PdfArray af = catalog.getAsArray(AF); 076 if (!af.isEmpty() && af.getDirectObject(0).isDictionary()) { 077 return af; 078 } 079 } 080 throw new InvoiceExtractionError("Pdf catalog does not contain Valid AF Entry"); 081 } 082 083 private static PdfDictionary getValidFileSpec(PdfArray af) { 084 if (af.isEmpty() || af.getAsDict(0) == null) { 085 throw new InvoiceExtractionError("Pdf does not contain a FileSpec Entry"); 086 } 087 return af.getAsDict(0); 088 } 089 090 private static PdfDictionary getValidEf(PdfDictionary fileSpec) { 091 if (fileSpec.contains(EF)) { 092 return fileSpec.getAsDict(EF); 093 } 094 throw new InvoiceExtractionError("Pdf catalog does not contain Valid EF Entry"); 095 } 096 097 private static byte[] getFStream(PdfDictionary ef){ 098 if (ef.contains(F)) { 099 PdfStream xmlStream = ef.getAsStream(F); 100 try { 101 return getStreamBytes((PRStream) xmlStream); 102 } catch (IOException e) { 103 throw new InvoiceExtractionError("Could not extrac xml content form pdf.",e); 104 } 105 } 106 throw new InvoiceExtractionError("Pdf catalog does not contain Valid F Entry"); 107 } 108 109 110}