/**
 * Copyright (c) 2000-2010 Liferay, Inc. All rights reserved.
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2.1 of the License, or (at your option)
 * any later version.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 */

package com.liferay.util.lucene;

import com.liferay.portal.kernel.io.unsync.UnsyncStringReader;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;

import net.htmlparser.jericho.Source;

import org.apache.jackrabbit.extractor.HTMLTextExtractor;

/**
 * HTML text extractor that replaces the inherited extraction with one based on
 * the Jericho HTML parser ({@link Source}).
 *
 * @author Brian Wing Shun Chan
 */
public class JerichoHTMLTextExtractor extends HTMLTextExtractor {

	/**
	 * Parses the given stream as HTML with Jericho and returns the extracted
	 * plain text.
	 *
	 * <p>
	 * The whole document is parsed before this method returns; the returned
	 * reader is backed by an in-memory string and no longer depends on the
	 * stream. The stream is therefore closed here, whether extraction succeeds
	 * or fails.
	 * </p>
	 *
	 * @param  stream the HTML content to extract text from
	 * @param  type the content type hint; not used by this implementation
	 * @param  encoding the character encoding hint; not used by this
	 *         implementation, the stream is handed to Jericho as is
	 * @return a reader over the text extracted from the document
	 * @throws IOException if the stream cannot be read or closed
	 */
	public Reader extractText(InputStream stream, String type, String encoding)
		throws IOException {

		try {
			Source source = new Source(stream);

			String text = source.getTextExtractor().toString();

			return new UnsyncStringReader(text);
		}
		finally {

			// Release the stream on every path so a parse failure cannot leak
			// it. NOTE(review): Jericho may already close the stream after a
			// successful read; a second close is expected to be harmless for
			// standard streams -- confirm for custom stream types.

			if (stream != null) {
				stream.close();
			}
		}
	}

}