001    /**
002     * Copyright (c) 2000-2013 Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.portal.metadata;
016    
017    import com.liferay.portal.kernel.exception.SystemException;
018    import com.liferay.portal.kernel.io.DummyWriter;
019    import com.liferay.portal.kernel.log.Log;
020    import com.liferay.portal.kernel.log.LogFactoryUtil;
021    import com.liferay.portal.kernel.util.StreamUtil;
022    
023    import java.io.File;
024    import java.io.FileInputStream;
025    import java.io.IOException;
026    import java.io.InputStream;
027    
028    import org.apache.commons.lang.exception.ExceptionUtils;
029    import org.apache.pdfbox.exceptions.CryptographyException;
030    import org.apache.poi.EncryptedDocumentException;
031    import org.apache.tika.metadata.Metadata;
032    import org.apache.tika.parser.ParseContext;
033    import org.apache.tika.parser.Parser;
034    import org.apache.tika.sax.WriteOutContentHandler;
035    
036    import org.xml.sax.ContentHandler;
037    
038    /**
039     * @author Miguel Pastor
040     * @author Alexander Chow
041     * @author Shuyang Zhou
042     */
043    public class TikaRawMetadataProcessor extends XugglerRawMetadataProcessor {
044    
045            public void setParser(Parser parser) {
046                    _parser = parser;
047            }
048    
049            protected Metadata extractMetadata(
050                            InputStream inputStream, Metadata metadata)
051                    throws IOException {
052    
053                    if (metadata == null) {
054                            metadata = new Metadata();
055                    }
056    
057                    ParseContext parserContext = new ParseContext();
058    
059                    parserContext.set(Parser.class, _parser);
060    
061                    ContentHandler contentHandler = new WriteOutContentHandler(
062                            new DummyWriter());
063    
064                    try {
065                            _parser.parse(inputStream, contentHandler, metadata, parserContext);
066                    }
067                    catch (Exception e) {
068                            Throwable throwable = ExceptionUtils.getRootCause(e);
069    
070                            if ((throwable instanceof CryptographyException) ||
071                                    (throwable instanceof EncryptedDocumentException)) {
072    
073                                    if (_log.isWarnEnabled()) {
074                                            _log.warn(
075                                                    "Unable to extract metadata from an encrypted file");
076                                    }
077                            }
078                            else {
079                                    _log.error(e, e);
080                            }
081    
082                            throw new IOException(e.getMessage());
083                    }
084    
085                    // Remove potential security risks
086    
087                    metadata.remove(XMPDM.ABS_PEAK_AUDIO_FILE_PATH.getName());
088                    metadata.remove(XMPDM.RELATIVE_PEAK_AUDIO_FILE_PATH.getName());
089    
090                    return metadata;
091            }
092    
093            @Override
094            protected Metadata extractMetadata(
095                            String extension, String mimeType, File file)
096                    throws SystemException {
097    
098                    Metadata metadata = super.extractMetadata(extension, mimeType, file);
099    
100                    InputStream inputStream = null;
101    
102                    try {
103                            inputStream = new FileInputStream(file);
104    
105                            return extractMetadata(inputStream, metadata);
106                    }
107                    catch (IOException ioe) {
108                            throw new SystemException(ioe);
109                    }
110                    finally {
111                            StreamUtil.cleanUp(inputStream);
112                    }
113            }
114    
115            @Override
116            protected Metadata extractMetadata(
117                            String extension, String mimeType, InputStream inputStream)
118                    throws SystemException {
119    
120                    Metadata metadata = super.extractMetadata(
121                            extension, mimeType, inputStream);
122    
123                    try {
124                            return extractMetadata(inputStream, metadata);
125                    }
126                    catch (IOException ioe) {
127                            throw new SystemException(ioe);
128                    }
129            }
130    
131            private static Log _log = LogFactoryUtil.getLog(
132                    TikaRawMetadataProcessor.class);
133    
134            private Parser _parser;
135    
136    }