001    /**
002     * Copyright (c) 2000-2010 Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.portal.util;
016    
017    import com.liferay.portal.kernel.util.CharPool;
018    import com.liferay.portal.kernel.util.Html;
019    import com.liferay.portal.kernel.util.HttpUtil;
020    import com.liferay.portal.kernel.util.StringBundler;
021    import com.liferay.portal.kernel.util.StringPool;
022    import com.liferay.portal.kernel.util.StringUtil;
023    
024    import java.util.regex.Matcher;
025    import java.util.regex.Pattern;
026    
027    import net.htmlparser.jericho.Source;
028    
029    /**
030     * @author Brian Wing Shun Chan
031     * @author Clarence Shen
032     * @author Harry Mark
033     * @author Samuel Kong
034     * @author Connor McKay
035     */
036    public class HtmlImpl implements Html {
037    
038            public static final int ESCAPE_MODE_ATTRIBUTE = 1;
039    
040            public static final int ESCAPE_MODE_CSS = 2;
041    
042            public static final int ESCAPE_MODE_JS = 3;
043    
044            public static final int ESCAPE_MODE_TEXT = 4;
045    
046            public static final int ESCAPE_MODE_URL = 5;
047    
048            public String escape(String text) {
049                    if (text == null) {
050                            return null;
051                    }
052    
053                    if (text.length() == 0) {
054                            return StringPool.BLANK;
055                    }
056    
057                    // Escape using XSS recommendations from
058                    // http://www.owasp.org/index.php/Cross_Site_Scripting
059                    // #How_to_Protect_Yourself
060    
061                    StringBuilder sb = new StringBuilder(text.length());
062    
063                    for (int i = 0; i < text.length(); i++) {
064                            char c = text.charAt(i);
065    
066                            switch (c) {
067                                    case '<':
068                                            sb.append("&lt;");
069    
070                                            break;
071    
072                                    case '>':
073                                            sb.append("&gt;");
074    
075                                            break;
076    
077                                    case '&':
078                                            sb.append("&amp;");
079    
080                                            break;
081    
082                                    case '"':
083                                            sb.append("&#034;");
084    
085                                            break;
086    
087                                    case '\'':
088                                            sb.append("&#039;");
089    
090                                            break;
091    
092                                    case '»':
093                                            sb.append("&raquo;");
094    
095                                            break;
096    
097                                    default:
098                                            sb.append(c);
099    
100                                            break;
101                            }
102                    }
103    
104                    return sb.toString();
105            }
106    
107            public String escape(String text, int type) {
108                    if (text == null) {
109                            return null;
110                    }
111    
112                    if (text.length() == 0) {
113                            return StringPool.BLANK;
114                    }
115    
116                    String prefix = StringPool.BLANK;
117                    String postfix = StringPool.BLANK;
118    
119                    if (type == ESCAPE_MODE_ATTRIBUTE) {
120                            prefix = "&#x";
121                            postfix = StringPool.SEMICOLON;
122                    }
123                    else if (type == ESCAPE_MODE_CSS) {
124                            prefix = StringPool.BACK_SLASH;
125                    }
126                    else if (type == ESCAPE_MODE_JS) {
127                            prefix = "\\x";
128                    }
129                    else if (type == ESCAPE_MODE_URL) {
130                            return HttpUtil.encodeURL(text, true);
131                    }
132                    else {
133                            return escape(text);
134                    }
135    
136                    StringBuilder sb = new StringBuilder();
137    
138                    for (int i = 0; i < text.length(); i++) {
139                            char c = text.charAt(i);
140    
141                            if ((Character.isLetterOrDigit(c)) ||
142                                    (c == CharPool.DASH) || (c == CharPool.UNDERLINE)) {
143    
144                                    sb.append(c);
145                            }
146                            else {
147                                    sb.append(prefix);
148                                    sb.append(Integer.toHexString(c));
149                                    sb.append(postfix);
150                            }
151                    }
152    
153                    return sb.toString();
154            }
155    
156            public String escapeAttribute(String attribute) {
157                    return escape(attribute, ESCAPE_MODE_ATTRIBUTE);
158            }
159    
160            public String escapeCSS(String css) {
161                    return escape(css, ESCAPE_MODE_CSS);
162            }
163    
164            public String escapeHREF(String href) {
165                    if (href == null) {
166                            return null;
167                    }
168    
169                    if (href.length() == 0) {
170                            return StringPool.BLANK;
171                    }
172    
173                    if (href.indexOf(StringPool.COLON) == 10) {
174                            String protocol = href.substring(0, 10).toLowerCase();
175    
176                            if (protocol.equals("javascript")) {
177                                    return StringUtil.replaceFirst(href, StringPool.COLON, "%3a");
178                            }
179                    }
180    
181                    return href;
182            }
183    
184            public String escapeJS(String js) {
185                    return escape(js, ESCAPE_MODE_JS);
186            }
187    
188            public String escapeURL(String url) {
189                    return escape(url, ESCAPE_MODE_URL);
190            }
191    
192            public String extractText(String html) {
193                    if (html == null) {
194                            return null;
195                    }
196    
197                    Source source = new Source(html);
198    
199                    return source.getTextExtractor().toString();
200            }
201    
202            public String fromInputSafe(String text) {
203                    return StringUtil.replace(text, "&amp;", "&");
204            }
205    
206            public String replaceMsWordCharacters(String text) {
207                    return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
208            }
209    
210            public String stripBetween(String text, String tag) {
211                    return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
212            }
213    
214            public String stripComments(String text) {
215                    return StringUtil.stripBetween(text, "<!--", "-->");
216            }
217    
218            public String stripHtml(String text) {
219                    if (text == null) {
220                            return null;
221                    }
222    
223                    text = stripComments(text);
224    
225                    StringBuilder sb = new StringBuilder(text.length());
226    
227                    int x = 0;
228                    int y = text.indexOf("<");
229    
230                    while (y != -1) {
231                            sb.append(text.substring(x, y));
232                            sb.append(StringPool.SPACE);
233    
234                            // Look for text enclosed by <script></script>
235    
236                            boolean scriptFound = isScriptTag(text, y + 1);
237    
238                            if (scriptFound) {
239                                    int pos = y + _TAG_SCRIPT.length;
240    
241                                    // Find end of the tag
242    
243                                    pos = text.indexOf(">", pos);
244    
245                                    if (pos >= 0) {
246    
247                                            // Check if preceding character is / (i.e. is this instance
248                                            // of <script/>)
249    
250                                            if (text.charAt(pos-1) != '/') {
251    
252                                                    // Search for the ending </script> tag
253    
254                                                    for (;;) {
255                                                            pos = text.indexOf("</", pos);
256    
257                                                            if (pos >= 0) {
258                                                                    if (isScriptTag(text, pos + 2)) {
259                                                                            y = pos;
260    
261                                                                            break;
262                                                                    }
263                                                                    else {
264    
265                                                                            // Skip past "</"
266    
267                                                                            pos += 2;
268                                                                    }
269                                                            }
270                                                            else {
271                                                                    break;
272                                                            }
273                                                    }
274                                            }
275                                    }
276                            }
277    
278                            x = text.indexOf(">", y);
279    
280                            if (x == -1) {
281                                    break;
282                            }
283    
284                            x++;
285    
286                            if (x < y) {
287    
288                                    // <b>Hello</b
289    
290                                    break;
291                            }
292    
293                            y = text.indexOf("<", x);
294                    }
295    
296                    if (y == -1) {
297                            sb.append(text.substring(x, text.length()));
298                    }
299    
300                    return sb.toString();
301            }
302    
303            public String toInputSafe(String text) {
304                    return StringUtil.replace(
305                            text,
306                            new String[] {"&", "\""},
307                            new String[] {"&amp;", "&quot;"});
308            }
309    
310            public String unescape(String text) {
311                    if (text == null) {
312                            return null;
313                    }
314    
315                    if (text.length() == 0) {
316                            return StringPool.BLANK;
317                    }
318    
319                    // Optimize this
320    
321                    text = StringUtil.replace(text, "&lt;", "<");
322                    text = StringUtil.replace(text, "&gt;", ">");
323                    text = StringUtil.replace(text, "&amp;", "&");
324                    text = StringUtil.replace(text, "&#034;", "\"");
325                    text = StringUtil.replace(text, "&#039;", "'");
326                    text = StringUtil.replace(text, "&#040;", "(");
327                    text = StringUtil.replace(text, "&#041;", ")");
328                    text = StringUtil.replace(text, "&#044;", ",");
329                    text = StringUtil.replace(text, "&#035;", "#");
330                    text = StringUtil.replace(text, "&#037;", "%");
331                    text = StringUtil.replace(text, "&#059;", ";");
332                    text = StringUtil.replace(text, "&#061;", "=");
333                    text = StringUtil.replace(text, "&#043;", "+");
334                    text = StringUtil.replace(text, "&#045;", "-");
335    
336                    return text;
337            }
338    
339            public String wordBreak(String text, int columns) {
340                    StringBundler sb = new StringBundler();
341    
342                    int length = 0;
343                    int lastWrite = 0;
344                    int pos = 0;
345    
346                    Pattern pattern = Pattern.compile("([\\s<&]|$)");
347    
348                    Matcher matcher = pattern.matcher(text);
349    
350                    while (matcher.find()) {
351                            if (matcher.start() < pos) {
352                                    continue;
353                            }
354    
355                            while ((length + matcher.start() - pos) >= columns) {
356                                    pos += columns - length;
357    
358                                    sb.append(text.substring(lastWrite, pos));
359                                    sb.append("<wbr/>");
360    
361                                    length = 0;
362                                    lastWrite = pos;
363                            }
364    
365                            length += matcher.start() - pos;
366    
367                            String group = matcher.group();
368    
369                            if (group.equals(StringPool.AMPERSAND)) {
370                                    int x = text.indexOf(StringPool.SEMICOLON, matcher.start());
371    
372                                    if (x != -1) {
373                                            length++;
374                                            pos = x + 1;
375                                    }
376    
377                                    continue;
378                            }
379    
380                            if (group.equals(StringPool.LESS_THAN)) {
381                                    int x = text.indexOf(StringPool.GREATER_THAN, matcher.start());
382    
383                                    if (x != -1) {
384                                            pos = x + 1;
385                                    }
386    
387                                    continue;
388                            }
389    
390                            if (group.equals(StringPool.SPACE) ||
391                                    group.equals(StringPool.NEW_LINE)) {
392    
393                                    length = 0;
394                                    pos = matcher.start() + 1;
395                            }
396                    }
397    
398                    sb.append(text.substring(lastWrite));
399    
400                    return sb.toString();
401            }
402    
403            protected boolean isScriptTag(String text, int pos) {
404                    if (pos + _TAG_SCRIPT.length + 1 <= text.length()) {
405                            char item;
406    
407                            for (int i = 0; i < _TAG_SCRIPT.length; i++) {
408                                    item = text.charAt(pos++);
409    
410                                    if (Character.toLowerCase(item) != _TAG_SCRIPT[i]) {
411                                            return false;
412                                    }
413                            }
414    
415                            item = text.charAt(pos);
416    
417                            // Check that char after "script" is not a letter (i.e. another tag)
418    
419                            return !Character.isLetter(item);
420                    }
421                    else {
422                            return false;
423                    }
424            }
425    
426            private static final String[] _MS_WORD_UNICODE = new String[] {
427                    "\u00ae", "\u2019", "\u201c", "\u201d"
428            };
429    
430            private static final String[] _MS_WORD_HTML = new String[] {
431                    "&reg;", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
432            };
433    
434            private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};
435    
436    }