001    /**
002     * Copyright (c) 2000-2013 Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.portlet.wiki.translators;
016    
017    import com.liferay.portal.kernel.util.StringPool;
018    import com.liferay.portal.kernel.util.StringUtil;
019    import com.liferay.portlet.wiki.importers.mediawiki.MediaWikiImporter;
020    
021    import java.util.regex.Matcher;
022    import java.util.regex.Pattern;
023    
024    /**
025     * @author Jorge Ferrer
026     * @author Daniel Kocsis
027     */
028    public class MediaWikiToCreoleTranslator extends BaseTranslator {
029    
030            public static final String TABLE_OF_CONTENTS = "<<TableOfContents>>\n\n";
031    
032            public MediaWikiToCreoleTranslator() {
033                    initRegexps();
034                    initNowikiRegexps();
035            }
036    
037            public boolean isStrictImportMode() {
038                    return _strictImportMode;
039            }
040    
041            public void setStrictImportMode(boolean strictImportMode) {
042                    _strictImportMode = strictImportMode;
043            }
044    
045            protected void initNowikiRegexps() {
046    
047                    // Preformat protected
048    
049                    nowikiRegexps.add("(<nowiki>)(.*?)(</nowiki>)");
050                    nowikiRegexps.add("(<pre>)(.*?)(</pre>)");
051    
052                    // Escape protected
053    
054                    nowikiRegexps.add(
055                            "~(\\*\\*|~|//|-|#|\\{\\{|}}|\\\\|~\\[~~[|]]|----|=|\\|)");
056            }
057    
058            protected void initRegexps() {
059    
060                    // Clean unnecessary header emphasis
061    
062                    regexps.put("= '''([^=]+)''' =", "= $1 =");
063                    regexps.put("== '''([^=]+)''' ==", "== $1 ==");
064                    regexps.put("== '''([^=]+)''' ===", "=== $1 ===");
065    
066                    // Unscape angle brackets
067    
068                    regexps.put("&lt;", "<");
069                    regexps.put("&gt;", ">");
070    
071                    // Remove categories
072    
073                    regexps.put("\\[\\[[Cc]ategory:([^\\]]*)\\]\\][\\n]*", "");
074    
075                    // Remove disambiguations
076    
077                    regexps.put("\\{{2}OtherTopics\\|([^\\}]*)\\}{2}", StringPool.BLANK);
078    
079                    // Remove work in progress
080    
081                    regexps.put("\\{{2}Work in progress\\}{2}", StringPool.BLANK);
082    
083                    // Remove references
084    
085                    regexps.put("\\[{2}Wikipedia:([^\\]]*)\\]{2}", StringPool.BLANK);
086    
087                    // Bold and italics
088    
089                    regexps.put(
090                            "''''((?s:.)*?)(''''|(\n\n|\r\r|\r\n\r\n))", "**//$1//**$3");
091    
092                    // Bold
093    
094                    regexps.put("'''((?s:.)*?)('''|(\n\n|\r\r|\r\n\r\n))", "**$1**$3");
095    
096                    // Italics
097    
098                    regexps.put("''((?s:.)*?)(''|(\n\n|\r\r|\r\n\r\n))", "//$1//$3");
099    
100                    // Normalize URLs
101    
102                    regexps.put("\\[{2}((http|ftp)[^ ]*) ([^\\]]*)\\]{2}", "[$1 $3]");
103    
104                    // URL
105    
106                    regexps.put("\\[((http|ftp)[^ ]*)\\]", "[[$1]]");
107    
108                    // URL with label
109    
110                    regexps.put("\\[((http|ftp)[^ ]*) ([^\\]]*)\\]", "[[$1|$3]]");
111    
112                    // Term and definition
113    
114                    regexps.put("^\\t([\\w]+):\\t(.*)", "**$1**:\n$2");
115    
116                    // Indented paragraph
117    
118                    regexps.put("^\\t:\\t(.*)", "$1");
119    
120                    // Monospace
121    
122                    regexps.put("(^ (.+))(\\n (.+))*", "{{{\n$0\n}}}");
123    
124                    // No wiki
125    
126                    regexps.put("<nowiki>([^<]*)</nowiki>", "{{{$1}}}");
127    
128                    // HTML PRE
129    
130                    regexps.put("<pre>([^<]*)</pre>", "{{{$1}}}");
131    
132                    // User reference
133    
134                    regexps.put("[-]*\\[{2}User:([^\\]]*)\\]{2}", "$1");
135            }
136    
137            @Override
138            protected String postProcess(String content) {
139                    if (_strictImportMode) {
140                            content = runRegexp(
141                                    content, "\\{{2}Special:(.*?)\\}{2}", StringPool.BLANK);
142                            content = runRegexp(content, "\\{{2}(.*?)\\}{2}", StringPool.BLANK);
143                            content = runRegexp(
144                                    content, "(?s)\\{{2}(.*?)\\}{2}", StringPool.BLANK);
145                    }
146                    else {
147                            content = runRegexp(
148                                    content, "\\{{2}Special:(.*?)\\}{2}", "{{{$1}}}\n");
149                            content = runRegexp(content, "\\{{2}(.*?)\\}{2}", "{{{$1}}}");
150                            content = runRegexp(
151                                    content, "([^\\{])(\\{{2})([^\\{])", "$1\n{{{\n$3");
152                            content = runRegexp(
153                                    content, "([^\\}])(\\}{2})([^\\}])", "$1\n}}}\n$3");
154                    }
155    
156                    // LEP-6118
157    
158                    Pattern pattern = Pattern.compile("^=([^=]+)=", Pattern.MULTILINE);
159    
160                    Matcher matcher = pattern.matcher(content);
161    
162                    if (matcher.find()) {
163                            content = runRegexp(content, "^===([^=]+)===", "====$1====");
164                            content = runRegexp(content, "^==([^=]+)==", "===$1===");
165                            content = runRegexp(content, "^=([^=]+)=", "==$1==");
166                    }
167    
168                    // Remove HTML tags
169    
170                    for (int i = 0; i < _HTML_TAGS.length; i++) {
171                            content = content.replaceAll(_HTML_TAGS[i], StringPool.BLANK);
172                    }
173    
174                    // Images
175    
176                    pattern = Pattern.compile("(\\[{2})(Image|File)(:)", Pattern.DOTALL);
177    
178                    matcher = pattern.matcher(content);
179    
180                    StringBuffer sb = new StringBuffer(content);
181    
182                    int level = 0;
183                    int offset = 0;
184                    int originalLength = 0;
185                    int prefixLength = 0;
186    
187                    while (matcher.find()) {
188                            level = 0;
189                            prefixLength = matcher.end(2) - matcher.start(2);
190    
191                            for (int i = matcher.start(0) + offset; i < sb.length() - 1; i++) {
192                                    if ((sb.charAt(i) == '[') && (sb.charAt(i + 1) == '[')) {
193                                            level++;
194                                    }
195                                    else if ((sb.charAt(i) == ']') && (sb.charAt(i + 1) == ']')) {
196                                            level--;
197    
198                                            if (level == 0) {
199                                                    originalLength = (i + 2) - (matcher.start(0) + offset);
200    
201                                                    break;
202                                            }
203                                    }
204                            }
205    
206                            int imageStartPos = matcher.end(3) + offset;
207                            int imageEndPos = matcher.start(2) + offset + originalLength - 4;
208    
209                            String image =
210                                    "{{" + MediaWikiImporter.SHARED_IMAGES_TITLE + "/" +
211                                            StringUtil.toLowerCase(
212                                                    sb.substring(imageStartPos, imageEndPos)) +
213                                                            "}}";
214    
215                            int imageLength = image.length();
216    
217                            image = image.replaceAll("\\[{2}", StringPool.BLANK);
218                            image = image.replaceAll("\\]{2}", StringPool.BLANK);
219    
220                            sb.replace(
221                                    matcher.start(0) + offset,
222                                    matcher.start(0) + originalLength + offset, image);
223    
224                            offset +=
225                                    MediaWikiImporter.SHARED_IMAGES_TITLE.length() - prefixLength -
226                                            (imageLength - image.length());
227                    }
228    
229                    content = sb.toString();
230    
231                    // Tables
232    
233                    pattern = Pattern.compile("\\{\\|(.*?)\\|\\}", Pattern.DOTALL);
234    
235                    matcher = pattern.matcher(content);
236    
237                    sb = new StringBuffer(content);
238    
239                    String mediaWikiTable = null;
240    
241                    offset = 0;
242                    originalLength = 0;
243    
244                    while (matcher.find()) {
245                            mediaWikiTable = sb.substring(
246                                    matcher.start(1) + offset, matcher.end(1) + offset);
247    
248                            originalLength = mediaWikiTable.length() + 4;
249    
250                            mediaWikiTable = mediaWikiTable.replaceAll(
251                                    "class=(.*?)[|\n\r]", StringPool.BLANK);
252                            mediaWikiTable = mediaWikiTable.replaceAll("(\\|\\-)(.*)", "$1");
253                            mediaWikiTable = mediaWikiTable.replaceAll(
254                                    "\\|\\+(.*)", "===$1===");
255                            mediaWikiTable = mediaWikiTable.replaceAll("(?m)^!(.+)", "|=$1|");
256                            mediaWikiTable = mediaWikiTable.replaceAll(
257                                    "[\n\r]", StringPool.BLANK);
258                            mediaWikiTable = mediaWikiTable.replaceAll("\\|\\-", "\n\r");
259                            mediaWikiTable = mediaWikiTable.replaceAll("\\|\\|", "|");
260                            mediaWikiTable = mediaWikiTable.replaceAll(
261                                    "/{4}", StringPool.BLANK);
262    
263                            sb.replace(
264                                    matcher.start(0) + offset,
265                                    matcher.start(0) + originalLength + offset, mediaWikiTable);
266    
267                            offset += mediaWikiTable.length() - originalLength;
268                    }
269    
270                    content = sb.toString();
271    
272                    content = runRegexp(content, "/{2}(\\{{3})", "$1");
273                    content = runRegexp(content, "(\\}{3})/{2}", "$1");
274    
275                    // Remove underscores from links
276    
277                    pattern = Pattern.compile("\\[{2}([^\\]]*)\\]{2}", Pattern.DOTALL);
278    
279                    matcher = pattern.matcher(content);
280    
281                    sb = new StringBuffer(content);
282    
283                    while (matcher.find()) {
284                            String link = matcher.group(1).replace(
285                                    StringPool.UNDERLINE, StringPool.SPACE);
286    
287                            sb.replace(matcher.start(1), matcher.end(1), link);
288                    }
289    
290                    return TABLE_OF_CONTENTS + super.postProcess(sb.toString());
291            }
292    
293            private static final String[] _HTML_TAGS = {
294                    "<blockquote>", "</blockquote>", "<br>", "<br/>", "<br />", "<center>",
295                    "</center>", "<cite>", "</cite>","<code>", "</code>", "<div[^>]*>",
296                    "</div>", "<font[^>]*>", "</font>", "<hr>", "<hr/>", "<hr />", "<p>",
297                    "</p>", "<tt>", "</tt>", "<var>", "</var>"
298            };
299    
300            private boolean _strictImportMode;
301    
302    }