1   /**
2    * Copyright (c) 2000-2009 Liferay, Inc. All rights reserved.
3    *
4    * Permission is hereby granted, free of charge, to any person obtaining a copy
5    * of this software and associated documentation files (the "Software"), to deal
6    * in the Software without restriction, including without limitation the rights
7    * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8    * copies of the Software, and to permit persons to whom the Software is
9    * furnished to do so, subject to the following conditions:
10   *
11   * The above copyright notice and this permission notice shall be included in
12   * all copies or substantial portions of the Software.
13   *
14   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17   * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20   * SOFTWARE.
21   */
22  
23  package com.liferay.portal.util;
24  
25  import au.id.jericho.lib.html.Source;
26  
27  import com.liferay.portal.kernel.util.Html;
28  import com.liferay.portal.kernel.util.StringPool;
29  import com.liferay.portal.kernel.util.StringUtil;
30  
31  /**
32   * <a href="HtmlImpl.java.html"><b><i>View Source</i></b></a>
33   *
34   * @author Brian Wing Shun Chan
35   * @author Clarence Shen
36   * @author Harry Mark
37   *
38   */
39  public class HtmlImpl implements Html {
40  
41      public String escape(String text) {
42          if (text == null) {
43              return null;
44          }
45  
46          // Escape using XSS recommendations from
47          // http://www.owasp.org/index.php/Cross_Site_Scripting
48          // #How_to_Protect_Yourself
49  
50          StringBuilder sb = new StringBuilder(text.length());
51  
52          for (int i = 0; i < text.length(); i++) {
53              char c = text.charAt(i);
54  
55              switch (c) {
56                  case '<':
57                      sb.append("&lt;");
58  
59                      break;
60  
61                  case '>':
62                      sb.append("&gt;");
63  
64                      break;
65  
66                  case '&':
67                      sb.append("&amp;");
68  
69                      break;
70  
71                  case '"':
72                      sb.append("&#034;");
73  
74                      break;
75  
76                  case '\'':
77                      sb.append("&#039;");
78  
79                      break;
80  
81                  case '(':
82                      sb.append("&#040;");
83  
84                      break;
85  
86                  case ')':
87                      sb.append("&#041;");
88  
89                      break;
90  
91                  case '#':
92                      sb.append("&#035;");
93  
94                      break;
95  
96                  case '%':
97                      sb.append("&#037;");
98  
99                      break;
100 
101                 case ';':
102                     sb.append("&#059;");
103 
104                     break;
105 
106                 case '+':
107                     sb.append("&#043;");
108 
109                     break;
110 
111                 case '-':
112                     sb.append("&#045;");
113 
114                     break;
115 
116                 default:
117                     sb.append(c);
118 
119                     break;
120             }
121         }
122 
123         return sb.toString();
124     }
125 
126     public String extractText(String html) {
127         if (html == null) {
128             return null;
129         }
130 
131         Source source = new Source(html);
132 
133         return source.getTextExtractor().toString();
134     }
135 
136     public String fromInputSafe(String text) {
137         return StringUtil.replace(text, "&amp;", "&");
138     }
139 
140     public String replaceMsWordCharacters(String text) {
141         return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
142     }
143 
144     public String stripBetween(String text, String tag) {
145         return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
146     }
147 
148     public String stripComments(String text) {
149         return StringUtil.stripBetween(text, "<!--", "-->");
150     }
151 
152     public String stripHtml(String text) {
153         if (text == null) {
154             return null;
155         }
156 
157         text = stripComments(text);
158 
159         StringBuilder sb = new StringBuilder(text.length());
160 
161         int x = 0;
162         int y = text.indexOf("<");
163 
164         while (y != -1) {
165             sb.append(text.substring(x, y));
166             sb.append(StringPool.SPACE);
167 
168             // Look for text enclosed by <script></script>
169 
170             boolean scriptFound = isScriptTag(text, y + 1);
171 
172             if (scriptFound) {
173                 int pos = y + _TAG_SCRIPT.length;
174 
175                 // Find end of the tag
176 
177                 pos = text.indexOf(">", pos);
178 
179                 if (pos >= 0) {
180 
181                     // Check if preceding character is / (i.e. is this instance
182                     // of <script/>)
183 
184                     if (text.charAt(pos-1) != '/') {
185 
186                         // Search for the ending </script> tag
187 
188                         for (;;) {
189                             pos = text.indexOf("</", pos);
190 
191                             if (pos >= 0) {
192                                 if (isScriptTag(text, pos + 2)) {
193                                     y = pos;
194 
195                                     break;
196                                 }
197                                 else {
198 
199                                     // Skip past "</"
200 
201                                     pos += 2;
202                                 }
203                             }
204                             else {
205                                 break;
206                             }
207                         }
208                     }
209                 }
210             }
211 
212             x = text.indexOf(">", y);
213 
214             if (x == -1) {
215                 break;
216             }
217 
218             x++;
219 
220             if (x < y) {
221 
222                 // <b>Hello</b
223 
224                 break;
225             }
226 
227             y = text.indexOf("<", x);
228         }
229 
230         if (y == -1) {
231             sb.append(text.substring(x, text.length()));
232         }
233 
234         return sb.toString();
235     }
236 
237     public String toInputSafe(String text) {
238         return StringUtil.replace(
239             text,
240             new String[] {"&", "\""},
241             new String[] {"&amp;", "&quot;"});
242     }
243 
244     public String unescape(String text) {
245         if (text == null) {
246             return null;
247         }
248 
249         // Optimize this
250 
251         text = StringUtil.replace(text, "&lt;", "<");
252         text = StringUtil.replace(text, "&gt;", ">");
253         text = StringUtil.replace(text, "&amp;", "&");
254         text = StringUtil.replace(text, "&#034;", "\"");
255         text = StringUtil.replace(text, "&#039;", "'");
256         text = StringUtil.replace(text, "&#040;", "(");
257         text = StringUtil.replace(text, "&#041;", ")");
258         text = StringUtil.replace(text, "&#035;", "#");
259         text = StringUtil.replace(text, "&#037;", "%");
260         text = StringUtil.replace(text, "&#059;", ";");
261         text = StringUtil.replace(text, "&#043;", "+");
262         text = StringUtil.replace(text, "&#045;", "-");
263 
264         return text;
265     }
266 
267     protected boolean isScriptTag(String text, int pos) {
268         if (pos + _TAG_SCRIPT.length + 1 <= text.length()) {
269             char item;
270 
271             for (int i = 0; i < _TAG_SCRIPT.length; i++) {
272                 item = text.charAt(pos++);
273 
274                 if (Character.toLowerCase(item) != _TAG_SCRIPT[i]) {
275                     return false;
276                 }
277             }
278 
279             item = text.charAt(pos);
280 
281             // Check that char after "script" is not a letter (i.e. another tag)
282 
283             return !Character.isLetter(item);
284         }
285         else {
286             return false;
287         }
288     }
289 
290     private static final String[] _MS_WORD_UNICODE = new String[] {
291         "\u00ae", "\u2019", "\u201c", "\u201d"
292     };
293 
294     private static final String[] _MS_WORD_HTML = new String[] {
295         "&reg;", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
296     };
297 
298     private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};
299 
300 }