/**
 * Copyright (c) 2000-2013 Liferay, Inc. All rights reserved.
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2.1 of the License, or (at your option)
 * any later version.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 */

package com.liferay.portlet.wiki.translators;

import com.liferay.portal.kernel.util.StringPool;
import com.liferay.portal.kernel.util.StringUtil;
import com.liferay.portlet.wiki.importers.mediawiki.MediaWikiImporter;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Translates MediaWiki markup into Creole markup.
 *
 * <p>
 * Simple one-to-one rewrites are registered as regular expression rules in the
 * inherited <code>regexps</code> and <code>nowikiRegexps</code> collections;
 * how and in which order those rules are applied is decided by
 * {@link BaseTranslator}. Conversions that need more than a single regular
 * expression (templates, heading levels, images, tables, link targets) are
 * performed in {@link #postProcess(String)}.
 * </p>
 *
 * @author Jorge Ferrer
 * @author Daniel Kocsis
 */
public class MediaWikiToCreoleTranslator extends BaseTranslator {

	/**
	 * Table of contents macro that is prepended to every translated page.
	 */
	public static final String TABLE_OF_CONTENTS = "<<TableOfContents>>\n\n";

	/**
	 * Creates a translator with the MediaWiki rewrite rules and the protected
	 * (nowiki) rules registered.
	 */
	public MediaWikiToCreoleTranslator() {
		initRegexps();
		initNowikiRegexps();
	}

	/**
	 * Returns whether strict import mode is enabled.
	 *
	 * @return <code>true</code> if MediaWiki templates are removed during post
	 *         processing instead of being converted to preformatted blocks
	 */
	public boolean isStrictImportMode() {
		return _strictImportMode;
	}

	/**
	 * Enables or disables strict import mode.
	 *
	 * @param strictImportMode <code>true</code> to remove MediaWiki templates
	 *        during post processing; <code>false</code> to convert them to
	 *        preformatted blocks
	 */
	public void setStrictImportMode(boolean strictImportMode) {
		_strictImportMode = strictImportMode;
	}

	/**
	 * Registers the expressions describing text that is handed to the base
	 * translator as protected content.
	 */
	protected void initNowikiRegexps() {

		// Preformat protected

		nowikiRegexps.add("(<nowiki>)(.*?)(</nowiki>)");
		nowikiRegexps.add("(<pre>)(.*?)(</pre>)");

		// Escape protected

		nowikiRegexps.add(
			"~(\\*\\*|~|//|-|#|\\{\\{|}}|\\\\|~\\[~~[|]]|----|=|\\|)");
	}

	/**
	 * Registers the MediaWiki to Creole rewrite rules, keyed by the regular
	 * expression to search for and mapped to its replacement.
	 */
	protected void initRegexps() {

		// Clean unnecessary header emphasis

		regexps.put("= '''([^=]+)''' =", "= $1 =");
		regexps.put("== '''([^=]+)''' ==", "== $1 ==");

		// The level 3 rule must open with three equal signs; with only two it
		// turned "=== '''x''' ===" into the unbalanced "==== x ==="

		regexps.put("=== '''([^=]+)''' ===", "=== $1 ===");

		// Unescape angle brackets. The search keys have to be the HTML
		// entities; replacing "<" with "<" would do nothing.

		regexps.put("&lt;", "<");
		regexps.put("&gt;", ">");

		// Remove categories

		regexps.put("\\[\\[[Cc]ategory:([^\\]]*)\\]\\][\\n]*", "");

		// Remove disambiguations

		regexps.put("\\{{2}OtherTopics\\|([^\\}]*)\\}{2}", StringPool.BLANK);

		// Remove work in progress

		regexps.put("\\{{2}Work in progress\\}{2}", StringPool.BLANK);

		// Remove references

		regexps.put("\\[{2}Wikipedia:([^\\]]*)\\]{2}", StringPool.BLANK);

		// Bold and italics

		regexps.put(
			"''''((?s:.)*?)(''''|(\n\n|\r\r|\r\n\r\n))", "**//$1//**$3");

		// Bold

		regexps.put("'''((?s:.)*?)('''|(\n\n|\r\r|\r\n\r\n))", "**$1**$3");

		// Italics

		regexps.put("''((?s:.)*?)(''|(\n\n|\r\r|\r\n\r\n))", "//$1//$3");

		// Normalize URLs

		regexps.put("\\[{2}((http|ftp)[^ ]*) ([^\\]]*)\\]{2}", "[$1 $3]");

		// URL

		regexps.put("\\[((http|ftp)[^ ]*)\\]", "[[$1]]");

		// URL with label

		regexps.put("\\[((http|ftp)[^ ]*) ([^\\]]*)\\]", "[[$1|$3]]");

		// Term and definition

		regexps.put("^\\t([\\w]+):\\t(.*)", "**$1**:\n$2");

		// Indented paragraph

		regexps.put("^\\t:\\t(.*)", "$1");

		// Monospace

		regexps.put("(^ (.+))(\\n (.+))*", "{{{\n$0\n}}}");

		// No wiki

		regexps.put("<nowiki>([^<]*)</nowiki>", "{{{$1}}}");

		// HTML PRE

		regexps.put("<pre>([^<]*)</pre>", "{{{$1}}}");

		// User reference

		regexps.put("[-]*\\[{2}User:([^\\]]*)\\]{2}", "$1");
	}

	/**
	 * Applies the conversions that cannot be expressed as a single rewrite
	 * rule, then delegates to the base translator and prepends
	 * {@link #TABLE_OF_CONTENTS}.
	 *
	 * <p>
	 * The steps run in this order: templates, heading levels, HTML tag
	 * removal, images, tables, cleanup of italics markers around preformatted
	 * blocks, and underscore removal inside links.
	 * </p>
	 *
	 * @param  content the partially translated page content
	 * @return the fully translated content, starting with the table of
	 *         contents macro
	 */
	@Override
	protected String postProcess(String content) {
		content = _convertTemplates(content);
		content = _demoteHeadings(content);
		content = _removeHtmlTags(content);
		content = _convertImages(content);
		content = _convertTables(content);

		// Drop italics markers that ended up glued to preformatted blocks

		content = runRegexp(content, "/{2}(\\{{3})", "$1");
		content = runRegexp(content, "(\\}{3})/{2}", "$1");

		content = _removeLinkUnderscores(content);

		return TABLE_OF_CONTENTS + super.postProcess(content);
	}

	/**
	 * Returns the index just past the <code>]]</code> that balances the
	 * <code>[[</code> found at <code>start</code>, or <code>-1</code> if the
	 * link is never closed.
	 */
	private static int _findLinkEnd(String content, int start) {
		int level = 0;

		for (int i = start; i < (content.length() - 1); i++) {
			char c = content.charAt(i);
			char next = content.charAt(i + 1);

			if ((c == '[') && (next == '[')) {
				level++;
			}
			else if ((c == ']') && (next == ']')) {
				level--;

				if (level == 0) {
					return i + 2;
				}
			}
		}

		return -1;
	}

	/**
	 * Rewrites <code>[[Image:...]]</code> and <code>[[File:...]]</code> links
	 * as Creole images located under the shared images page, lower casing the
	 * link body and stripping any double brackets nested inside it.
	 */
	private String _convertImages(String content) {
		Matcher matcher = _IMAGE_PATTERN.matcher(content);

		StringBuilder sb = new StringBuilder(content.length());

		int lastPos = 0;

		while (matcher.find()) {
			int linkStartPos = matcher.start();

			// An image link nested inside one that was already converted has
			// had its brackets stripped by the outer conversion

			if (linkStartPos < lastPos) {
				continue;
			}

			int linkEndPos = _findLinkEnd(content, linkStartPos);

			// Leave unterminated links untouched rather than failing or
			// replacing an arbitrary range

			if (linkEndPos < 0) {
				continue;
			}

			// The link body sits between the colon and the closing brackets

			String body = content.substring(matcher.end(3), linkEndPos - 2);

			String image =
				"{{" + MediaWikiImporter.SHARED_IMAGES_TITLE + "/" +
					StringUtil.toLowerCase(body) + "}}";

			image = image.replace("[[", StringPool.BLANK);
			image = image.replace("]]", StringPool.BLANK);

			sb.append(content, lastPos, linkStartPos);
			sb.append(image);

			lastPos = linkEndPos;
		}

		sb.append(content, lastPos, content.length());

		return sb.toString();
	}

	/**
	 * Rewrites every <code>{| ... |}</code> MediaWiki table as a Creole table.
	 */
	private String _convertTables(String content) {
		Matcher matcher = _TABLE_PATTERN.matcher(content);

		StringBuilder sb = new StringBuilder(content.length());

		int lastPos = 0;

		while (matcher.find()) {
			String table = matcher.group(1);

			// Drop class attributes and anything trailing a row separator

			table = table.replaceAll("class=(.*?)[|\n\r]", StringPool.BLANK);
			table = table.replaceAll("(\\|\\-)(.*)", "$1");

			// Caption and header cells

			table = table.replaceAll("\\|\\+(.*)", "===$1===");
			table = table.replaceAll("(?m)^!(.+)", "|=$1|");

			// Join everything on one line, then break at the row separators

			table = table.replaceAll("[\n\r]", StringPool.BLANK);
			table = table.replaceAll("\\|\\-", "\n\r");
			table = table.replaceAll("\\|\\|", "|");
			table = table.replaceAll("/{4}", StringPool.BLANK);

			sb.append(content, lastPos, matcher.start());
			sb.append(table);

			lastPos = matcher.end();
		}

		sb.append(content, lastPos, content.length());

		return sb.toString();
	}

	/**
	 * Removes MediaWiki templates in strict import mode; otherwise converts
	 * them, and any remaining double braces, to preformatted blocks.
	 */
	private String _convertTemplates(String content) {
		if (_strictImportMode) {
			content = runRegexp(
				content, "\\{{2}Special:(.*?)\\}{2}", StringPool.BLANK);
			content = runRegexp(content, "\\{{2}(.*?)\\}{2}", StringPool.BLANK);
			content = runRegexp(
				content, "(?s)\\{{2}(.*?)\\}{2}", StringPool.BLANK);
		}
		else {
			content = runRegexp(
				content, "\\{{2}Special:(.*?)\\}{2}", "{{{$1}}}\n");
			content = runRegexp(content, "\\{{2}(.*?)\\}{2}", "{{{$1}}}");
			content = runRegexp(
				content, "([^\\{])(\\{{2})([^\\{])", "$1\n{{{\n$3");
			content = runRegexp(
				content, "([^\\}])(\\}{2})([^\\}])", "$1\n}}}\n$3");
		}

		return content;
	}

	/**
	 * Pushes level 1 to 3 headings one level down when the content contains
	 * at least one level 1 heading (LEP-6118).
	 */
	private String _demoteHeadings(String content) {
		Matcher matcher = _HEADING_PATTERN.matcher(content);

		if (!matcher.find()) {
			return content;
		}

		// Deepest level first so that a heading is never demoted twice

		content = runRegexp(content, "^===([^=]+)===", "====$1====");
		content = runRegexp(content, "^==([^=]+)==", "===$1===");
		content = runRegexp(content, "^=([^=]+)=", "==$1==");

		return content;
	}

	/**
	 * Removes every HTML tag matched by {@link #_HTML_TAGS}, keeping the text
	 * the tags enclose.
	 */
	private String _removeHtmlTags(String content) {
		for (String htmlTag : _HTML_TAGS) {
			content = content.replaceAll(htmlTag, StringPool.BLANK);
		}

		return content;
	}

	/**
	 * Replaces underscores with spaces inside <code>[[...]]</code> links.
	 */
	private String _removeLinkUnderscores(String content) {
		Matcher matcher = _LINK_PATTERN.matcher(content);

		StringBuilder sb = new StringBuilder(content.length());

		int lastPos = 0;

		while (matcher.find()) {
			String link = matcher.group(1).replace(
				StringPool.UNDERLINE, StringPool.SPACE);

			sb.append(content, lastPos, matcher.start(1));
			sb.append(link);

			lastPos = matcher.end(1);
		}

		sb.append(content, lastPos, content.length());

		return sb.toString();
	}

	private static final Pattern _HEADING_PATTERN = Pattern.compile(
		"^=([^=]+)=", Pattern.MULTILINE);

	private static final String[] _HTML_TAGS = {
		"<blockquote>", "</blockquote>", "<br>", "<br/>", "<br />", "<center>",
		"</center>", "<cite>", "</cite>","<code>", "</code>", "<div[^>]*>",
		"</div>", "<font[^>]*>", "</font>", "<hr>", "<hr/>", "<hr />", "<p>",
		"</p>", "<tt>", "</tt>", "<var>", "</var>"
	};

	private static final Pattern _IMAGE_PATTERN = Pattern.compile(
		"(\\[{2})(Image|File)(:)", Pattern.DOTALL);

	private static final Pattern _LINK_PATTERN = Pattern.compile(
		"\\[{2}([^\\]]*)\\]{2}", Pattern.DOTALL);

	private static final Pattern _TABLE_PATTERN = Pattern.compile(
		"\\{\\|(.*?)\\|\\}", Pattern.DOTALL);

	private boolean _strictImportMode;

}