001 /** 002 * Copyright (c) 2000-2010 Liferay, Inc. All rights reserved. 003 * 004 * This library is free software; you can redistribute it and/or modify it under 005 * the terms of the GNU Lesser General Public License as published by the Free 006 * Software Foundation; either version 2.1 of the License, or (at your option) 007 * any later version. 008 * 009 * This library is distributed in the hope that it will be useful, but WITHOUT 010 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 011 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 012 * details. 013 */ 014 015 package com.liferay.portlet.wiki.translators; 016 017 import com.liferay.portal.kernel.util.StringPool; 018 import com.liferay.portlet.wiki.importers.mediawiki.MediaWikiImporter; 019 020 import java.util.regex.Matcher; 021 import java.util.regex.Pattern; 022 023 /** 024 * @author Jorge Ferrer 025 */ 026 public class MediaWikiToCreoleTranslator extends BaseTranslator { 027 028 public static final String TABLE_OF_CONTENTS = "<<TableOfContents>>\n\n"; 029 030 public MediaWikiToCreoleTranslator() { 031 initRegexps(); 032 initNowikiRegexps(); 033 } 034 035 protected void initNowikiRegexps() { 036 037 // Preformat protected 038 039 nowikiRegexps.add("(<nowiki>)(.*?)(</nowiki>)"); 040 nowikiRegexps.add("(<pre>)(.*?)(</pre>)"); 041 042 // Escape protected 043 044 nowikiRegexps.add( 045 "~(\\*\\*|~|//|-|#|\\{\\{|}}|\\\\|~\\[~~[|]]|----|=|\\|)"); 046 } 047 048 protected void initRegexps() { 049 050 // Clean unnecessary header emphasis 051 052 regexps.put("= '''([^=]+)''' =", "= $1 ="); 053 regexps.put("== '''([^=]+)''' ==", "== $1 =="); 054 regexps.put("== '''([^=]+)''' ===", "=== $1 ==="); 055 056 // Unscape angle brackets 057 058 regexps.put("<", "<"); 059 regexps.put(">", ">"); 060 061 // Remove categories 062 063 regexps.put("\\[\\[[Cc]ategory:([^\\]]*)\\]\\][\\n]*", ""); 064 065 // Remove disambiguations 066 067 regexps.put("\\{{2}OtherTopics\\|([^\\}]*)\\}{2}", StringPool.BLANK); 068 069 // Remove work in progress 070 071 regexps.put("\\{{2}Work in progress\\}{2}", StringPool.BLANK); 072 073 // Bold and italics 074 075 regexps.put( 076 "''''((?s:.)*?)(''''|(\n\n|\r\r|\r\n\r\n))", "**//$1//**$3"); 077 078 // Bold 079 080 regexps.put("'''((?s:.)*?)('''|(\n\n|\r\r|\r\n\r\n))", "**$1**$3"); 081 082 // Italics 083 084 regexps.put("''((?s:.)*?)(''|(\n\n|\r\r|\r\n\r\n))", "//$1//$3"); 085 086 // Normalize URLs 087 088 regexps.put("\\[{2}((http|ftp)[^ ]*) ([^\\]]*)\\]{2}", "[$1 $3]"); 089 090 // URL 091 092 regexps.put("\\[((http|ftp)[^ ]*)\\]", "[[$1]]"); 093 094 // URL with label 095 096 regexps.put("\\[((http|ftp)[^ ]*) ([^\\]]*)\\]", "[[$1|$3]]"); 097 098 // Term and definition 099 100 regexps.put("^\\t([\\w]+):\\t(.*)", "**$1**:\n$2"); 101 102 // Indented paragraph 103 104 regexps.put("^\\t:\\t(.*)", "$1"); 105 106 // Monospace 107 108 regexps.put("(^ (.+))(\\n (.+))*", "{{{\n$0\n}}}"); 109 110 // No wiki 111 112 regexps.put("<nowiki>([^<]*)</nowiki>", "{{{$1}}}"); 113 114 // HTML PRE 115 116 regexps.put("<pre>([^<]*)</pre>", "{{{$1}}}"); 117 118 // User reference 119 120 regexps.put("[-]*\\[{2}User:([^\\]]*)\\]{2}", "$1"); 121 } 122 123 protected String postProcess(String content) { 124 125 // LEP-6118 126 127 Matcher matcher = Pattern.compile( 128 "^=([^=]+)=", Pattern.MULTILINE).matcher(content); 129 130 if (matcher.find()) { 131 content = runRegexp(content, "^===([^=]+)===", "====$1===="); 132 content = runRegexp(content, "^==([^=]+)==", "===$1==="); 133 content = runRegexp(content, "^=([^=]+)=", "==$1=="); 134 } 135 136 // Remove HTML tags 137 138 for (int i = 0; i < _HTML_TAGS.length; i++) { 139 content = content.replaceAll(_HTML_TAGS[i], StringPool.BLANK); 140 } 141 142 // Images 143 144 matcher = Pattern.compile( 145 "\\[{2}Image:([^\\]]*)\\]{2}", Pattern.DOTALL).matcher(content); 146 147 StringBuffer sb = new StringBuffer(content); 148 149 int offset = 0; 150 151 while (matcher.find()) { 152 String image = 153 "{{" + MediaWikiImporter.SHARED_IMAGES_TITLE + "/" + 154 matcher.group(1).toLowerCase() + "}}"; 155 156 sb.replace( 157 matcher.start(0) + offset, matcher.end(0) + offset, image); 158 159 offset += MediaWikiImporter.SHARED_IMAGES_TITLE.length() - 5; 160 } 161 162 content = sb.toString(); 163 164 // Remove underscores from links 165 166 matcher = Pattern.compile( 167 "\\[{2}([^\\]]*)\\]{2}", Pattern.DOTALL).matcher(content); 168 169 sb = new StringBuffer(content); 170 171 while (matcher.find()) { 172 String link = matcher.group(1).replace( 173 StringPool.UNDERLINE, StringPool.SPACE); 174 175 sb.replace(matcher.start(1), matcher.end(1), link); 176 } 177 178 return TABLE_OF_CONTENTS + super.postProcess(sb.toString()); 179 } 180 181 private static final String[] _HTML_TAGS = { 182 "<blockquote>", "</blockquote>", "<br>", "<br/>", "<br />", "<center>", 183 "</center>", "<cite>", "</cite>","<code>", "</code>", "<div[^>]*>", 184 "</div>", "<font[^>]*>", "</font>", "<hr>", "<hr/>", "<hr />", "<p>", 185 "</p>", "<tt>", "</tt>", "<var>", "</var>"}; 186 187 }