001    /**
002     * Copyright (c) 2000-2010 Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.portlet.wiki.translators;
016    
017    import com.liferay.portal.kernel.util.StringPool;
018    import com.liferay.portlet.wiki.importers.mediawiki.MediaWikiImporter;
019    
020    import java.util.regex.Matcher;
021    import java.util.regex.Pattern;
022    
023    /**
024     * @author Jorge Ferrer
025     */
026    public class MediaWikiToCreoleTranslator extends BaseTranslator {
027    
028            public static final String TABLE_OF_CONTENTS = "<<TableOfContents>>\n\n";
029    
030            public MediaWikiToCreoleTranslator() {
031                    initRegexps();
032                    initNowikiRegexps();
033            }
034    
035            protected void initNowikiRegexps() {
036    
037                    // Preformat protected
038    
039                    nowikiRegexps.add("(<nowiki>)(.*?)(</nowiki>)");
040                    nowikiRegexps.add("(<pre>)(.*?)(</pre>)");
041    
042                    // Escape protected
043    
044                    nowikiRegexps.add(
045                            "~(\\*\\*|~|//|-|#|\\{\\{|}}|\\\\|~\\[~~[|]]|----|=|\\|)");
046            }
047    
048            protected void initRegexps() {
049    
050                    // Clean unnecessary header emphasis
051    
052                    regexps.put("= '''([^=]+)''' =", "= $1 =");
053                    regexps.put("== '''([^=]+)''' ==", "== $1 ==");
054                    regexps.put("== '''([^=]+)''' ===", "=== $1 ===");
055    
056                    // Unscape angle brackets
057    
058                    regexps.put("&lt;", "<");
059                    regexps.put("&gt;", ">");
060    
061                    // Remove categories
062    
063                    regexps.put("\\[\\[[Cc]ategory:([^\\]]*)\\]\\][\\n]*", "");
064    
065                    // Remove disambiguations
066    
067                    regexps.put("\\{{2}OtherTopics\\|([^\\}]*)\\}{2}", StringPool.BLANK);
068    
069                    // Remove work in progress
070    
071                    regexps.put("\\{{2}Work in progress\\}{2}", StringPool.BLANK);
072    
073                    // Bold and italics
074    
075                    regexps.put(
076                            "''''((?s:.)*?)(''''|(\n\n|\r\r|\r\n\r\n))", "**//$1//**$3");
077    
078                    // Bold
079    
080                    regexps.put("'''((?s:.)*?)('''|(\n\n|\r\r|\r\n\r\n))", "**$1**$3");
081    
082                    // Italics
083    
084                    regexps.put("''((?s:.)*?)(''|(\n\n|\r\r|\r\n\r\n))", "//$1//$3");
085    
086                    // Normalize URLs
087    
088                    regexps.put("\\[{2}((http|ftp)[^ ]*) ([^\\]]*)\\]{2}", "[$1 $3]");
089    
090                    // URL
091    
092                    regexps.put("\\[((http|ftp)[^ ]*)\\]", "[[$1]]");
093    
094                    // URL with label
095    
096                    regexps.put("\\[((http|ftp)[^ ]*) ([^\\]]*)\\]", "[[$1|$3]]");
097    
098                    // Term and definition
099    
100                    regexps.put("^\\t([\\w]+):\\t(.*)", "**$1**:\n$2");
101    
102                    // Indented paragraph
103    
104                    regexps.put("^\\t:\\t(.*)", "$1");
105    
106                    // Monospace
107    
108                    regexps.put("(^ (.+))(\\n (.+))*", "{{{\n$0\n}}}");
109    
110                    // No wiki
111    
112                    regexps.put("<nowiki>([^<]*)</nowiki>", "{{{$1}}}");
113    
114                    // HTML PRE
115    
116                    regexps.put("<pre>([^<]*)</pre>", "{{{$1}}}");
117    
118                    // User reference
119    
120                    regexps.put("[-]*\\[{2}User:([^\\]]*)\\]{2}", "$1");
121            }
122    
123            protected String postProcess(String content) {
124    
125                    // LEP-6118
126    
127                    Matcher matcher = Pattern.compile(
128                            "^=([^=]+)=", Pattern.MULTILINE).matcher(content);
129    
130                    if (matcher.find()) {
131                            content = runRegexp(content, "^===([^=]+)===", "====$1====");
132                            content = runRegexp(content, "^==([^=]+)==", "===$1===");
133                            content = runRegexp(content, "^=([^=]+)=", "==$1==");
134                    }
135    
136                    // Remove HTML tags
137    
138                    for (int i = 0; i < _HTML_TAGS.length; i++) {
139                            content = content.replaceAll(_HTML_TAGS[i], StringPool.BLANK);
140                    }
141    
142                    // Images
143    
144                    matcher = Pattern.compile(
145                            "\\[{2}Image:([^\\]]*)\\]{2}", Pattern.DOTALL).matcher(content);
146    
147                    StringBuffer sb = new StringBuffer(content);
148    
149                    int offset = 0;
150    
151                    while (matcher.find()) {
152                            String image =
153                                    "{{" + MediaWikiImporter.SHARED_IMAGES_TITLE + "/" +
154                                            matcher.group(1).toLowerCase() + "}}";
155    
156                            sb.replace(
157                                    matcher.start(0) + offset, matcher.end(0) + offset, image);
158    
159                            offset += MediaWikiImporter.SHARED_IMAGES_TITLE.length() - 5;
160                    }
161    
162                    content = sb.toString();
163    
164                    // Remove underscores from links
165    
166                    matcher = Pattern.compile(
167                            "\\[{2}([^\\]]*)\\]{2}", Pattern.DOTALL).matcher(content);
168    
169                    sb = new StringBuffer(content);
170    
171                    while (matcher.find()) {
172                            String link = matcher.group(1).replace(
173                                    StringPool.UNDERLINE, StringPool.SPACE);
174    
175                            sb.replace(matcher.start(1), matcher.end(1), link);
176                    }
177    
178                    return TABLE_OF_CONTENTS + super.postProcess(sb.toString());
179            }
180    
181            private static final String[] _HTML_TAGS = {
182                    "<blockquote>", "</blockquote>", "<br>", "<br/>", "<br />",  "<center>",
183                    "</center>", "<cite>", "</cite>","<code>", "</code>", "<div[^>]*>",
184                    "</div>", "<font[^>]*>", "</font>", "<hr>", "<hr/>", "<hr />", "<p>",
185                    "</p>", "<tt>", "</tt>", "<var>", "</var>"};
186    
187    }