001    /**
002     * Copyright (c) 2000-2013 Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.portal.search.lucene;
016    
import com.liferay.portal.kernel.io.unsync.UnsyncByteArrayInputStream;
import com.liferay.portal.kernel.util.CharPool;
import com.liferay.portal.kernel.util.FileUtil;
import com.liferay.portal.kernel.util.Validator;
import com.liferay.portal.util.PropsValues;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.document.Field;
029    
030    /**
031     * @author Brian Wing Shun Chan
032     */
033    public class LuceneFileExtractor {
034    
035            public Field getFile(String field, byte[] bytes, String fileExt) {
036                    InputStream is = new UnsyncByteArrayInputStream(bytes);
037    
038                    return getFile(field, is, fileExt);
039            }
040    
041            public Field getFile(String field, File file, String fileExt)
042                    throws IOException {
043    
044                    InputStream is = new FileInputStream(file);
045    
046                    return getFile(field, is, fileExt);
047            }
048    
049            public Field getFile(String field, InputStream is, String fileExt) {
050                    String text = FileUtil.extractText(is, fileExt);
051    
052                    if (Validator.isNotNull(
053                                    PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
054    
055                            text = regexpStrip(text);
056                    }
057    
058                    return LuceneFields.getText(field, text);
059            }
060    
061            protected String regexpStrip(String text) {
062                    char[] array = text.toCharArray();
063    
064                    for (int i = 0; i < array.length; i++) {
065                            String s = String.valueOf(array[i]);
066    
067                            if (!s.matches(PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
068                                    array[i] = CharPool.SPACE;
069                            }
070                    }
071    
072                    return new String(array);
073            }
074    
075    }