001    /**
002     * Copyright (c) 2000-2010 Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.util.lucene;
016    
017    import com.liferay.portal.kernel.io.unsync.UnsyncStringReader;
018    
019    import java.io.IOException;
020    import java.io.InputStream;
021    import java.io.Reader;
022    
023    import net.htmlparser.jericho.Source;
024    
025    import org.apache.jackrabbit.extractor.HTMLTextExtractor;
026    
027    /**
028     * @author Brian Wing Shun Chan
029     */
030    public class JerichoHTMLTextExtractor extends HTMLTextExtractor {
031    
032            public Reader extractText(InputStream stream, String type, String encoding)
033                    throws IOException {
034    
035                    Source source = new Source(stream);
036    
037                    return new UnsyncStringReader(source.getTextExtractor().toString());
038            }
039    
040    }