001    /**
002     * Copyright (c) 2000-2013 Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.portal.metadata;
016    
017    import com.liferay.portal.kernel.exception.SystemException;
018    import com.liferay.portal.kernel.io.DummyWriter;
019    import com.liferay.portal.kernel.log.Log;
020    import com.liferay.portal.kernel.log.LogFactoryUtil;
021    import com.liferay.portal.kernel.process.ClassPathUtil;
022    import com.liferay.portal.kernel.process.ProcessCallable;
023    import com.liferay.portal.kernel.process.ProcessException;
024    import com.liferay.portal.kernel.process.ProcessExecutor;
025    import com.liferay.portal.kernel.util.ArrayUtil;
026    import com.liferay.portal.kernel.util.FileUtil;
027    import com.liferay.portal.kernel.util.StreamUtil;
028    import com.liferay.portal.util.PropsValues;
029    
030    import java.io.File;
031    import java.io.FileInputStream;
032    import java.io.IOException;
033    import java.io.InputStream;
034    
035    import java.util.concurrent.Future;
036    
037    import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
038    import org.apache.commons.lang.exception.ExceptionUtils;
039    import org.apache.pdfbox.exceptions.CryptographyException;
040    import org.apache.poi.EncryptedDocumentException;
041    import org.apache.tika.exception.TikaException;
042    import org.apache.tika.metadata.Metadata;
043    import org.apache.tika.parser.ParseContext;
044    import org.apache.tika.parser.Parser;
045    import org.apache.tika.sax.WriteOutContentHandler;
046    
047    import org.xml.sax.ContentHandler;
048    
049    /**
050     * @author Miguel Pastor
051     * @author Alexander Chow
052     * @author Shuyang Zhou
053     */
054    public class TikaRawMetadataProcessor extends XugglerRawMetadataProcessor {
055    
056            public void setParser(Parser parser) {
057                    _parser = parser;
058            }
059    
060            protected static Metadata extractMetadata(
061                            InputStream inputStream, Metadata metadata, Parser parser)
062                    throws IOException {
063    
064                    if (metadata == null) {
065                            metadata = new Metadata();
066                    }
067    
068                    ParseContext parserContext = new ParseContext();
069    
070                    parserContext.set(Parser.class, parser);
071    
072                    ContentHandler contentHandler = new WriteOutContentHandler(
073                            new DummyWriter());
074    
075                    try {
076                            parser.parse(inputStream, contentHandler, metadata, parserContext);
077                    }
078                    catch (Exception e) {
079                            Throwable throwable = ExceptionUtils.getRootCause(e);
080    
081                            if ((throwable instanceof CryptographyException) ||
082                                    (throwable instanceof EncryptedDocumentException) ||
083                                    (throwable instanceof UnsupportedZipFeatureException)) {
084    
085                                    if (_log.isWarnEnabled()) {
086                                            _log.warn(
087                                                    "Unable to extract metadata from an encrypted file");
088                                    }
089                            }
090                            else if (e instanceof TikaException) {
091                                    if (_log.isWarnEnabled()) {
092                                            _log.warn("Unable to extract metadata");
093                                    }
094                            }
095                            else {
096                                    _log.error(e, e);
097                            }
098    
099                            throw new IOException(e.getMessage());
100                    }
101    
102                    // Remove potential security risks
103    
104                    metadata.remove(XMPDM.ABS_PEAK_AUDIO_FILE_PATH.getName());
105                    metadata.remove(XMPDM.RELATIVE_PEAK_AUDIO_FILE_PATH.getName());
106    
107                    return metadata;
108            }
109    
110            @Override
111            protected Metadata extractMetadata(
112                            String extension, String mimeType, File file)
113                    throws SystemException {
114    
115                    Metadata metadata = super.extractMetadata(extension, mimeType, file);
116    
117                    boolean forkProcess = false;
118    
119                    if (PropsValues.TEXT_EXTRACTION_FORK_PROCESS_ENABLED) {
120                            if (ArrayUtil.contains(
121                                            PropsValues.TEXT_EXTRACTION_FORK_PROCESS_MIME_TYPES,
122                                            mimeType)) {
123    
124                                    forkProcess = true;
125                            }
126                    }
127    
128                    if (forkProcess) {
129                            ExtractMetadataProcessCallable extractMetadataProcessCallable =
130                                    new ExtractMetadataProcessCallable(file, metadata, _parser);
131    
132                            try {
133                                    Future<Metadata> future = ProcessExecutor.execute(
134                                            ClassPathUtil.getPortalClassPath(),
135                                            extractMetadataProcessCallable);
136    
137                                    return future.get();
138                            }
139                            catch (Exception e) {
140                                    throw new SystemException(e);
141                            }
142                    }
143    
144                    InputStream inputStream = null;
145    
146                    try {
147                            inputStream = new FileInputStream(file);
148    
149                            return extractMetadata(inputStream, metadata, _parser);
150                    }
151                    catch (IOException ioe) {
152                            throw new SystemException(ioe);
153                    }
154                    finally {
155                            StreamUtil.cleanUp(inputStream);
156                    }
157            }
158    
159            @Override
160            protected Metadata extractMetadata(
161                            String extension, String mimeType, InputStream inputStream)
162                    throws SystemException {
163    
164                    Metadata metadata = super.extractMetadata(
165                            extension, mimeType, inputStream);
166    
167                    boolean forkProcess = false;
168    
169                    if (PropsValues.TEXT_EXTRACTION_FORK_PROCESS_ENABLED) {
170                            if (ArrayUtil.contains(
171                                            PropsValues.TEXT_EXTRACTION_FORK_PROCESS_MIME_TYPES,
172                                            mimeType)) {
173    
174                                    forkProcess = true;
175                            }
176                    }
177    
178                    if (forkProcess) {
179                            File file = FileUtil.createTempFile();
180    
181                            try {
182                                    FileUtil.write(file, inputStream);
183    
184                                    ExtractMetadataProcessCallable extractMetadataProcessCallable =
185                                            new ExtractMetadataProcessCallable(file, metadata, _parser);
186    
187                                    Future<Metadata> future = ProcessExecutor.execute(
188                                            ClassPathUtil.getPortalClassPath(),
189                                            extractMetadataProcessCallable);
190    
191                                    return future.get();
192                            }
193                            catch (Exception e) {
194                                    throw new SystemException(e);
195                            }
196                            finally {
197                                    file.delete();
198                            }
199                    }
200    
201                    try {
202                            return extractMetadata(inputStream, metadata, _parser);
203                    }
204                    catch (IOException ioe) {
205                            throw new SystemException(ioe);
206                    }
207            }
208    
209            private static Log _log = LogFactoryUtil.getLog(
210                    TikaRawMetadataProcessor.class);
211    
212            private Parser _parser;
213    
214            private static class ExtractMetadataProcessCallable
215                    implements ProcessCallable<Metadata> {
216    
217                    public ExtractMetadataProcessCallable(
218                            File file, Metadata metadata, Parser parser) {
219    
220                            _file = file;
221                            _metadata = metadata;
222                            _parser = parser;
223                    }
224    
225                    @Override
226                    public Metadata call() throws ProcessException {
227                            InputStream inputStream = null;
228    
229                            try {
230                                    inputStream = new FileInputStream(_file);
231    
232                                    return extractMetadata(inputStream, _metadata, _parser);
233                            }
234                            catch (IOException ioe) {
235                                    throw new ProcessException(ioe);
236                            }
237                            finally {
238                                    if (inputStream != null) {
239                                            try {
240                                                    inputStream.close();
241                                            }
242                                            catch (IOException ioe) {
243                                                    throw new ProcessException(ioe);
244                                            }
245                                    }
246                            }
247                    }
248    
249                    private static final long serialVersionUID = 1L;
250    
251                    private File _file;
252                    private Metadata _metadata;
253                    private Parser _parser;
254    
255            }
256    
257    }