/**
 * Copyright (c) 2000-2013 Liferay, Inc. All rights reserved.
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2.1 of the License, or (at your option)
 * any later version.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 */

package com.liferay.portlet.wiki.translators;

import com.liferay.portal.kernel.util.StringPool;
import com.liferay.portlet.wiki.importers.mediawiki.MediaWikiImporter;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Translates MediaWiki markup into Creole markup.
 *
 * <p>
 * The constructor registers a set of regular expression rewrite rules in the
 * inherited <code>regexps</code> and <code>nowikiRegexps</code> collections.
 * {@link #postProcess(String)} then performs the rewrites that cannot be
 * expressed as a single rule (templates, headings, images, tables and links).
 * </p>
 *
 * @author Jorge Ferrer
 * @author Daniel Kocsis
 */
public class MediaWikiToCreoleTranslator extends BaseTranslator {

    /**
     * Creole macro that is prepended to every translated page.
     */
    public static final String TABLE_OF_CONTENTS = "<<TableOfContents>>\n\n";

    /**
     * Creates a translator with the MediaWiki rewrite rules registered.
     */
    public MediaWikiToCreoleTranslator() {
        initRegexps();
        initNowikiRegexps();
    }

    /**
     * Returns <code>true</code> if MediaWiki templates
     * (<code>{{...}}</code>) are dropped during post processing instead of
     * being kept as preformatted text.
     *
     * @return the current strict import mode flag
     */
    public boolean isStrictImportMode() {
        return _strictImportMode;
    }

    /**
     * Sets whether MediaWiki templates (<code>{{...}}</code>) are dropped
     * during post processing instead of being kept as preformatted text.
     *
     * @param strictImportMode the new strict import mode flag
     */
    public void setStrictImportMode(boolean strictImportMode) {
        _strictImportMode = strictImportMode;
    }

    /**
     * Registers the patterns that describe text regions which must not be
     * rewritten by the regular rules.
     */
    protected void initNowikiRegexps() {

        // Preformat protected

        nowikiRegexps.add("(<nowiki>)(.*?)(</nowiki>)");
        nowikiRegexps.add("(<pre>)(.*?)(</pre>)");

        // Escape protected

        nowikiRegexps.add(
            "~(\\*\\*|~|//|-|#|\\{\\{|}}|\\\\|~\\[~~[|]]|----|=|\\|)");
    }

    /**
     * Registers the pattern to replacement rules that map MediaWiki syntax to
     * Creole syntax.
     */
    protected void initRegexps() {

        // Clean unnecessary header emphasis. The level 3 pattern previously
        // opened with "==" although it closes with "===" and is replaced by a
        // level 3 heading.

        regexps.put("= '''([^=]+)''' =", "= $1 =");
        regexps.put("== '''([^=]+)''' ==", "== $1 ==");
        regexps.put("=== '''([^=]+)''' ===", "=== $1 ===");

        // Unescape angle brackets. The patterns must be the HTML entities;
        // replacing "<" with "<" would be a no-op.

        regexps.put("&lt;", "<");
        regexps.put("&gt;", ">");

        // Remove categories

        regexps.put("\\[\\[[Cc]ategory:([^\\]]*)\\]\\][\\n]*", "");

        // Remove disambiguations

        regexps.put("\\{{2}OtherTopics\\|([^\\}]*)\\}{2}", StringPool.BLANK);

        // Remove work in progress

        regexps.put("\\{{2}Work in progress\\}{2}", StringPool.BLANK);

        // Remove references

        regexps.put("\\[{2}Wikipedia:([^\\]]*)\\]{2}", StringPool.BLANK);

        // Bold and italics

        regexps.put(
            "''''((?s:.)*?)(''''|(\n\n|\r\r|\r\n\r\n))", "**//$1//**$3");

        // Bold

        regexps.put("'''((?s:.)*?)('''|(\n\n|\r\r|\r\n\r\n))", "**$1**$3");

        // Italics

        regexps.put("''((?s:.)*?)(''|(\n\n|\r\r|\r\n\r\n))", "//$1//$3");

        // Normalize URLs

        regexps.put("\\[{2}((http|ftp)[^ ]*) ([^\\]]*)\\]{2}", "[$1 $3]");

        // URL

        regexps.put("\\[((http|ftp)[^ ]*)\\]", "[[$1]]");

        // URL with label

        regexps.put("\\[((http|ftp)[^ ]*) ([^\\]]*)\\]", "[[$1|$3]]");

        // Term and definition

        regexps.put("^\\t([\\w]+):\\t(.*)", "**$1**:\n$2");

        // Indented paragraph

        regexps.put("^\\t:\\t(.*)", "$1");

        // Monospace

        regexps.put("(^ (.+))(\\n (.+))*", "{{{\n$0\n}}}");

        // No wiki

        regexps.put("<nowiki>([^<]*)</nowiki>", "{{{$1}}}");

        // HTML PRE

        regexps.put("<pre>([^<]*)</pre>", "{{{$1}}}");

        // User reference

        regexps.put("[-]*\\[{2}User:([^\\]]*)\\]{2}", "$1");
    }

    /**
     * Applies the rewrites that need more than a single rule and prepends
     * {@link #TABLE_OF_CONTENTS} to the result of the inherited post
     * processing.
     *
     * <p>
     * The steps run in this order: templates, heading levels, HTML tag
     * removal, images, tables, link underscores.
     * </p>
     *
     * @param  content the partially translated page content
     * @return the post processed content, starting with the table of contents
     *         macro
     */
    @Override
    protected String postProcess(String content) {
        content = translateTemplates(content);
        content = demoteHeadings(content);
        content = removeHtmlTags(content);
        content = translateImages(content);
        content = translateTables(content);
        content = removeLinkUnderscores(content);

        return TABLE_OF_CONTENTS + super.postProcess(content);
    }

    /**
     * Removes templates in strict import mode, otherwise rewrites them as
     * preformatted blocks.
     */
    private String translateTemplates(String content) {
        if (_strictImportMode) {
            content = runRegexp(
                content, "\\{{2}Special:(.*?)\\}{2}", StringPool.BLANK);
            content = runRegexp(content, "\\{{2}(.*?)\\}{2}", StringPool.BLANK);
            content = runRegexp(
                content, "(?s)\\{{2}(.*?)\\}{2}", StringPool.BLANK);

            return content;
        }

        content = runRegexp(
            content, "\\{{2}Special:(.*?)\\}{2}", "{{{$1}}}\n");
        content = runRegexp(content, "\\{{2}(.*?)\\}{2}", "{{{$1}}}");
        content = runRegexp(
            content, "([^\\{])(\\{{2})([^\\{])", "$1\n{{{\n$3");
        content = runRegexp(
            content, "([^\\}])(\\}{2})([^\\}])", "$1\n}}}\n$3");

        return content;
    }

    /**
     * Pushes every heading down one level when the page contains at least one
     * level 1 heading (LEP-6118).
     */
    private String demoteHeadings(String content) {
        if (!_LEVEL_ONE_HEADING_PATTERN.matcher(content).find()) {
            return content;
        }

        // Deepest level first so that a heading is not demoted twice

        content = runRegexp(content, "^===([^=]+)===", "====$1====");
        content = runRegexp(content, "^==([^=]+)==", "===$1===");
        content = runRegexp(content, "^=([^=]+)=", "==$1==");

        return content;
    }

    /**
     * Strips the HTML tags listed in <code>_HTML_TAGS</code>, keeping the text
     * they enclose.
     */
    private String removeHtmlTags(String content) {
        for (String htmlTag : _HTML_TAGS) {
            content = content.replaceAll(htmlTag, StringPool.BLANK);
        }

        return content;
    }

    /**
     * Rewrites <code>[[Image:...]]</code> and <code>[[File:...]]</code> links
     * as Creole images below the shared images page. The image text is lower
     * cased and nested link brackets inside it are removed.
     */
    private String translateImages(String content) {
        Matcher matcher = _IMAGE_PATTERN.matcher(content);

        StringBuilder sb = new StringBuilder(content);

        // The matcher reports positions in "content"; adding "offset" maps
        // them to positions in "sb", which changes length as images are
        // replaced.

        int offset = 0;

        // End of the last replaced image, in "content" coordinates

        int replacedEnd = 0;

        while (matcher.find()) {
            if (matcher.start() < replacedEnd) {

                // Nested inside an image that was already rewritten, so the
                // text no longer exists in the buffer

                continue;
            }

            int start = matcher.start() + offset;

            int end = findClosingBrackets(sb, start);

            if (end < 0) {

                // Unterminated link, leave the text untouched

                continue;
            }

            int originalLength = end - start;

            String image =
                "{{" + MediaWikiImporter.SHARED_IMAGES_TITLE + "/" +
                    sb.substring(matcher.end(3) + offset, end - 2).
                        toLowerCase() +
                            "}}";

            image = image.replaceAll("\\[{2}", StringPool.BLANK);
            image = image.replaceAll("\\]{2}", StringPool.BLANK);

            sb.replace(start, end, image);

            replacedEnd = matcher.start() + originalLength;
            offset += image.length() - originalLength;
        }

        return sb.toString();
    }

    /**
     * Returns the index just past the <code>]]</code> that closes the
     * <code>[[</code> located at <code>start</code>, honoring nested
     * <code>[[...]]</code> pairs, or <code>-1</code> if it is never closed.
     */
    private static int findClosingBrackets(CharSequence text, int start) {
        int level = 0;

        for (int i = start; i < text.length() - 1; i++) {
            char c = text.charAt(i);

            if (c != text.charAt(i + 1)) {
                continue;
            }

            if (c == '[') {
                level++;

                // Consume both characters of the delimiter so that
                // overlapping pairs are not counted twice

                i++;
            }
            else if (c == ']') {
                level--;

                i++;

                if (level == 0) {
                    return i + 1;
                }
            }
        }

        return -1;
    }

    /**
     * Rewrites MediaWiki tables (<code>{| ... |}</code>) as Creole tables and
     * removes the emphasis markers left around preformatted blocks.
     */
    private String translateTables(String content) {
        Matcher matcher = _TABLE_PATTERN.matcher(content);

        StringBuilder sb = new StringBuilder(content);

        int offset = 0;

        while (matcher.find()) {
            String mediaWikiTable = matcher.group(1);

            int originalLength = matcher.end() - matcher.start();

            mediaWikiTable = mediaWikiTable.replaceAll(
                "class=(.*?)[|\n\r]", StringPool.BLANK);
            mediaWikiTable = mediaWikiTable.replaceAll("(\\|\\-)(.*)", "$1");
            mediaWikiTable = mediaWikiTable.replaceAll(
                "\\|\\+(.*)", "===$1===");
            mediaWikiTable = mediaWikiTable.replaceAll("(?m)^!(.+)", "|=$1|");
            mediaWikiTable = mediaWikiTable.replaceAll(
                "[\n\r]", StringPool.BLANK);
            mediaWikiTable = mediaWikiTable.replaceAll("\\|\\-", "\n\r");
            mediaWikiTable = mediaWikiTable.replaceAll("\\|\\|", "|");
            mediaWikiTable = mediaWikiTable.replaceAll(
                "/{4}", StringPool.BLANK);

            sb.replace(
                matcher.start() + offset, matcher.end() + offset,
                mediaWikiTable);

            // Accumulate the shift; overwriting it would misplace the third
            // and later tables

            offset += mediaWikiTable.length() - originalLength;
        }

        content = sb.toString();

        content = runRegexp(content, "/{2}(\\{{3})", "$1");
        content = runRegexp(content, "(\\}{3})/{2}", "$1");

        return content;
    }

    /**
     * Replaces underscores with spaces inside <code>[[...]]</code> links.
     */
    private String removeLinkUnderscores(String content) {
        Matcher matcher = _LINK_PATTERN.matcher(content);

        StringBuilder sb = new StringBuilder(content);

        while (matcher.find()) {
            String link = matcher.group(1).replace(
                StringPool.UNDERLINE, StringPool.SPACE);

            // No offset is tracked here: this assumes the underscore and
            // space constants have the same length

            sb.replace(matcher.start(1), matcher.end(1), link);
        }

        return sb.toString();
    }

    private static final String[] _HTML_TAGS = {
        "<blockquote>", "</blockquote>", "<br>", "<br/>", "<br />", "<center>",
        "</center>", "<cite>", "</cite>","<code>", "</code>", "<div[^>]*>",
        "</div>", "<font[^>]*>", "</font>", "<hr>", "<hr/>", "<hr />", "<p>",
        "</p>", "<tt>", "</tt>", "<var>", "</var>"
    };

    private static final Pattern _IMAGE_PATTERN = Pattern.compile(
        "(\\[{2})(Image|File)(:)", Pattern.DOTALL);

    private static final Pattern _LEVEL_ONE_HEADING_PATTERN = Pattern.compile(
        "^=([^=]+)=", Pattern.MULTILINE);

    private static final Pattern _LINK_PATTERN = Pattern.compile(
        "\\[{2}([^\\]]*)\\]{2}", Pattern.DOTALL);

    private static final Pattern _TABLE_PATTERN = Pattern.compile(
        "\\{\\|(.*?)\\|\\}", Pattern.DOTALL);

    private boolean _strictImportMode;

}