001
014
015 package com.liferay.portal.metadata;
016
017 import com.liferay.portal.kernel.exception.SystemException;
018 import com.liferay.portal.kernel.io.DummyWriter;
019 import com.liferay.portal.kernel.log.Log;
020 import com.liferay.portal.kernel.log.LogFactoryUtil;
021 import com.liferay.portal.kernel.process.ClassPathUtil;
022 import com.liferay.portal.kernel.process.ProcessCallable;
023 import com.liferay.portal.kernel.process.ProcessException;
024 import com.liferay.portal.kernel.process.ProcessExecutor;
025 import com.liferay.portal.kernel.util.ArrayUtil;
026 import com.liferay.portal.kernel.util.FileUtil;
027 import com.liferay.portal.kernel.util.StreamUtil;
028 import com.liferay.portal.util.PropsValues;
029
030 import java.io.File;
031 import java.io.FileInputStream;
032 import java.io.IOException;
033 import java.io.InputStream;
034
035 import java.util.concurrent.Future;
036
037 import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
038 import org.apache.commons.lang.exception.ExceptionUtils;
039 import org.apache.pdfbox.exceptions.CryptographyException;
040 import org.apache.poi.EncryptedDocumentException;
041 import org.apache.tika.exception.TikaException;
042 import org.apache.tika.metadata.Metadata;
043 import org.apache.tika.parser.ParseContext;
044 import org.apache.tika.parser.Parser;
045 import org.apache.tika.sax.WriteOutContentHandler;
046
047 import org.xml.sax.ContentHandler;
048
049
054 public class TikaRawMetadataProcessor extends XugglerRawMetadataProcessor {
055
056 public void setParser(Parser parser) {
057 _parser = parser;
058 }
059
060 protected static Metadata extractMetadata(
061 InputStream inputStream, Metadata metadata, Parser parser)
062 throws IOException {
063
064 if (metadata == null) {
065 metadata = new Metadata();
066 }
067
068 ParseContext parserContext = new ParseContext();
069
070 parserContext.set(Parser.class, parser);
071
072 ContentHandler contentHandler = new WriteOutContentHandler(
073 new DummyWriter());
074
075 try {
076 parser.parse(inputStream, contentHandler, metadata, parserContext);
077 }
078 catch (Exception e) {
079 Throwable throwable = ExceptionUtils.getRootCause(e);
080
081 if ((throwable instanceof CryptographyException) ||
082 (throwable instanceof EncryptedDocumentException) ||
083 (throwable instanceof UnsupportedZipFeatureException)) {
084
085 if (_log.isWarnEnabled()) {
086 _log.warn(
087 "Unable to extract metadata from an encrypted file");
088 }
089 }
090 else if (e instanceof TikaException) {
091 if (_log.isWarnEnabled()) {
092 _log.warn("Unable to extract metadata");
093 }
094 }
095 else {
096 _log.error(e, e);
097 }
098
099 throw new IOException(e.getMessage());
100 }
101
102
103
104 metadata.remove(XMPDM.ABS_PEAK_AUDIO_FILE_PATH.getName());
105 metadata.remove(XMPDM.RELATIVE_PEAK_AUDIO_FILE_PATH.getName());
106
107 return metadata;
108 }
109
110 @Override
111 protected Metadata extractMetadata(
112 String extension, String mimeType, File file)
113 throws SystemException {
114
115 Metadata metadata = super.extractMetadata(extension, mimeType, file);
116
117 boolean forkProcess = false;
118
119 if (PropsValues.TEXT_EXTRACTION_FORK_PROCESS_ENABLED) {
120 if (ArrayUtil.contains(
121 PropsValues.TEXT_EXTRACTION_FORK_PROCESS_MIME_TYPES,
122 mimeType)) {
123
124 forkProcess = true;
125 }
126 }
127
128 if (forkProcess) {
129 ExtractMetadataProcessCallable extractMetadataProcessCallable =
130 new ExtractMetadataProcessCallable(file, metadata, _parser);
131
132 try {
133 Future<Metadata> future = ProcessExecutor.execute(
134 ClassPathUtil.getPortalClassPath(),
135 extractMetadataProcessCallable);
136
137 return future.get();
138 }
139 catch (Exception e) {
140 throw new SystemException(e);
141 }
142 }
143
144 InputStream inputStream = null;
145
146 try {
147 inputStream = new FileInputStream(file);
148
149 return extractMetadata(inputStream, metadata, _parser);
150 }
151 catch (IOException ioe) {
152 throw new SystemException(ioe);
153 }
154 finally {
155 StreamUtil.cleanUp(inputStream);
156 }
157 }
158
159 @Override
160 protected Metadata extractMetadata(
161 String extension, String mimeType, InputStream inputStream)
162 throws SystemException {
163
164 Metadata metadata = super.extractMetadata(
165 extension, mimeType, inputStream);
166
167 boolean forkProcess = false;
168
169 if (PropsValues.TEXT_EXTRACTION_FORK_PROCESS_ENABLED) {
170 if (ArrayUtil.contains(
171 PropsValues.TEXT_EXTRACTION_FORK_PROCESS_MIME_TYPES,
172 mimeType)) {
173
174 forkProcess = true;
175 }
176 }
177
178 if (forkProcess) {
179 File file = FileUtil.createTempFile();
180
181 try {
182 FileUtil.write(file, inputStream);
183
184 ExtractMetadataProcessCallable extractMetadataProcessCallable =
185 new ExtractMetadataProcessCallable(file, metadata, _parser);
186
187 Future<Metadata> future = ProcessExecutor.execute(
188 ClassPathUtil.getPortalClassPath(),
189 extractMetadataProcessCallable);
190
191 return future.get();
192 }
193 catch (Exception e) {
194 throw new SystemException(e);
195 }
196 finally {
197 file.delete();
198 }
199 }
200
201 try {
202 return extractMetadata(inputStream, metadata, _parser);
203 }
204 catch (IOException ioe) {
205 throw new SystemException(ioe);
206 }
207 }
208
209 private static Log _log = LogFactoryUtil.getLog(
210 TikaRawMetadataProcessor.class);
211
212 private Parser _parser;
213
214 private static class ExtractMetadataProcessCallable
215 implements ProcessCallable<Metadata> {
216
217 public ExtractMetadataProcessCallable(
218 File file, Metadata metadata, Parser parser) {
219
220 _file = file;
221 _metadata = metadata;
222 _parser = parser;
223 }
224
225 @Override
226 public Metadata call() throws ProcessException {
227 InputStream inputStream = null;
228
229 try {
230 inputStream = new FileInputStream(_file);
231
232 return extractMetadata(inputStream, _metadata, _parser);
233 }
234 catch (IOException ioe) {
235 throw new ProcessException(ioe);
236 }
237 finally {
238 if (inputStream != null) {
239 try {
240 inputStream.close();
241 }
242 catch (IOException ioe) {
243 throw new ProcessException(ioe);
244 }
245 }
246 }
247 }
248
249 private static final long serialVersionUID = 1L;
250
251 private File _file;
252 private Metadata _metadata;
253 private Parser _parser;
254
255 }
256
257 }