001    /**
002     * Copyright (c) 2000-2013 Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.portlet.wiki.translators;
016    
017    import com.liferay.portal.kernel.util.StringPool;
018    import com.liferay.portlet.wiki.importers.mediawiki.MediaWikiImporter;
019    
020    import java.util.regex.Matcher;
021    import java.util.regex.Pattern;
022    
023    /**
024     * @author Jorge Ferrer
025     * @author Daniel Kocsis
026     */
027    public class MediaWikiToCreoleTranslator extends BaseTranslator {
028    
029            public static final String TABLE_OF_CONTENTS = "<<TableOfContents>>\n\n";
030    
031            public MediaWikiToCreoleTranslator() {
032                    initRegexps();
033                    initNowikiRegexps();
034            }
035    
036            public boolean isStrictImportMode() {
037                    return _strictImportMode;
038            }
039    
040            public void setStrictImportMode(boolean strictImportMode) {
041                    _strictImportMode = strictImportMode;
042            }
043    
044            protected void initNowikiRegexps() {
045    
046                    // Preformat protected
047    
048                    nowikiRegexps.add("(<nowiki>)(.*?)(</nowiki>)");
049                    nowikiRegexps.add("(<pre>)(.*?)(</pre>)");
050    
051                    // Escape protected
052    
053                    nowikiRegexps.add(
054                            "~(\\*\\*|~|//|-|#|\\{\\{|}}|\\\\|~\\[~~[|]]|----|=|\\|)");
055            }
056    
057            protected void initRegexps() {
058    
059                    // Clean unnecessary header emphasis
060    
061                    regexps.put("= '''([^=]+)''' =", "= $1 =");
062                    regexps.put("== '''([^=]+)''' ==", "== $1 ==");
063                    regexps.put("== '''([^=]+)''' ===", "=== $1 ===");
064    
065                    // Unscape angle brackets
066    
067                    regexps.put("&lt;", "<");
068                    regexps.put("&gt;", ">");
069    
070                    // Remove categories
071    
072                    regexps.put("\\[\\[[Cc]ategory:([^\\]]*)\\]\\][\\n]*", "");
073    
074                    // Remove disambiguations
075    
076                    regexps.put("\\{{2}OtherTopics\\|([^\\}]*)\\}{2}", StringPool.BLANK);
077    
078                    // Remove work in progress
079    
080                    regexps.put("\\{{2}Work in progress\\}{2}", StringPool.BLANK);
081    
082                    // Remove references
083    
084                    regexps.put("\\[{2}Wikipedia:([^\\]]*)\\]{2}", StringPool.BLANK);
085    
086                    // Bold and italics
087    
088                    regexps.put(
089                            "''''((?s:.)*?)(''''|(\n\n|\r\r|\r\n\r\n))", "**//$1//**$3");
090    
091                    // Bold
092    
093                    regexps.put("'''((?s:.)*?)('''|(\n\n|\r\r|\r\n\r\n))", "**$1**$3");
094    
095                    // Italics
096    
097                    regexps.put("''((?s:.)*?)(''|(\n\n|\r\r|\r\n\r\n))", "//$1//$3");
098    
099                    // Normalize URLs
100    
101                    regexps.put("\\[{2}((http|ftp)[^ ]*) ([^\\]]*)\\]{2}", "[$1 $3]");
102    
103                    // URL
104    
105                    regexps.put("\\[((http|ftp)[^ ]*)\\]", "[[$1]]");
106    
107                    // URL with label
108    
109                    regexps.put("\\[((http|ftp)[^ ]*) ([^\\]]*)\\]", "[[$1|$3]]");
110    
111                    // Term and definition
112    
113                    regexps.put("^\\t([\\w]+):\\t(.*)", "**$1**:\n$2");
114    
115                    // Indented paragraph
116    
117                    regexps.put("^\\t:\\t(.*)", "$1");
118    
119                    // Monospace
120    
121                    regexps.put("(^ (.+))(\\n (.+))*", "{{{\n$0\n}}}");
122    
123                    // No wiki
124    
125                    regexps.put("<nowiki>([^<]*)</nowiki>", "{{{$1}}}");
126    
127                    // HTML PRE
128    
129                    regexps.put("<pre>([^<]*)</pre>", "{{{$1}}}");
130    
131                    // User reference
132    
133                    regexps.put("[-]*\\[{2}User:([^\\]]*)\\]{2}", "$1");
134            }
135    
136            @Override
137            protected String postProcess(String content) {
138                    if (_strictImportMode) {
139                            content = runRegexp(
140                                    content, "\\{{2}Special:(.*?)\\}{2}", StringPool.BLANK);
141                            content = runRegexp(content, "\\{{2}(.*?)\\}{2}", StringPool.BLANK);
142                            content = runRegexp(
143                                    content, "(?s)\\{{2}(.*?)\\}{2}", StringPool.BLANK);
144                    }
145                    else {
146                            content = runRegexp(
147                                    content, "\\{{2}Special:(.*?)\\}{2}", "{{{$1}}}\n");
148                            content = runRegexp(content, "\\{{2}(.*?)\\}{2}", "{{{$1}}}");
149                            content = runRegexp(
150                                    content, "([^\\{])(\\{{2})([^\\{])", "$1\n{{{\n$3");
151                            content = runRegexp(
152                                    content, "([^\\}])(\\}{2})([^\\}])", "$1\n}}}\n$3");
153                    }
154    
155                    // LEP-6118
156    
157                    Matcher matcher = Pattern.compile(
158                            "^=([^=]+)=", Pattern.MULTILINE).matcher(content);
159    
160                    if (matcher.find()) {
161                            content = runRegexp(content, "^===([^=]+)===", "====$1====");
162                            content = runRegexp(content, "^==([^=]+)==", "===$1===");
163                            content = runRegexp(content, "^=([^=]+)=", "==$1==");
164                    }
165    
166                    // Remove HTML tags
167    
168                    for (int i = 0; i < _HTML_TAGS.length; i++) {
169                            content = content.replaceAll(_HTML_TAGS[i], StringPool.BLANK);
170                    }
171    
172                    // Images
173    
174                    matcher = Pattern.compile(
175                            "(\\[{2})(Image|File)(:)", Pattern.DOTALL).matcher(content);
176    
177                    StringBuffer sb = new StringBuffer(content);
178    
179                    int level = 0;
180                    int offset = 0;
181                    int originalLength = 0;
182                    int prefixLength = 0;
183    
184                    while (matcher.find()) {
185                            level = 0;
186                            prefixLength = matcher.end(2) - matcher.start(2);
187    
188                            for (int i = matcher.start(0) + offset; i < sb.length() - 1; i++) {
189                                    if ((sb.charAt(i) == '[') && (sb.charAt(i + 1) == '[')) {
190                                            level++;
191                                    }
192                                    else if ((sb.charAt(i) == ']') && (sb.charAt(i + 1) == ']')) {
193                                            level--;
194    
195                                            if (level == 0) {
196                                                    originalLength = (i + 2) - (matcher.start(0) + offset);
197    
198                                                    break;
199                                            }
200                                    }
201                            }
202    
203                            int imageStartPos = matcher.end(3) + offset;
204                            int imageEndPos = matcher.start(2) + offset + originalLength - 4;
205    
206                            String image =
207                                    "{{" + MediaWikiImporter.SHARED_IMAGES_TITLE + "/" +
208                                            sb.substring(imageStartPos, imageEndPos).toLowerCase() +
209                                                    "}}";
210    
211                            int imageLength = image.length();
212    
213                            image = image.replaceAll("\\[{2}", StringPool.BLANK);
214                            image = image.replaceAll("\\]{2}", StringPool.BLANK);
215    
216                            sb.replace(
217                                    matcher.start(0) + offset,
218                                    matcher.start(0) + originalLength + offset, image);
219    
220                            offset +=
221                                    MediaWikiImporter.SHARED_IMAGES_TITLE.length() - prefixLength -
222                                            (imageLength - image.length());
223                    }
224    
225                    content = sb.toString();
226    
227                    // Tables
228    
229                    matcher = Pattern.compile(
230                            "\\{\\|(.*?)\\|\\}", Pattern.DOTALL).matcher(content);
231    
232                    sb = new StringBuffer(content);
233    
234                    String mediaWikiTable = null;
235    
236                    offset = 0;
237                    originalLength = 0;
238    
239                    while (matcher.find()) {
240                            mediaWikiTable = sb.substring(
241                                    matcher.start(1) + offset, matcher.end(1) + offset);
242    
243                            originalLength = mediaWikiTable.length() + 4;
244    
245                            mediaWikiTable = mediaWikiTable.replaceAll(
246                                    "class=(.*?)[|\n\r]", StringPool.BLANK);
247                            mediaWikiTable = mediaWikiTable.replaceAll("(\\|\\-)(.*)", "$1");
248                            mediaWikiTable = mediaWikiTable.replaceAll(
249                                    "\\|\\+(.*)", "===$1===");
250                            mediaWikiTable = mediaWikiTable.replaceAll("(?m)^!(.+)", "|=$1|");
251                            mediaWikiTable = mediaWikiTable.replaceAll(
252                                    "[\n\r]", StringPool.BLANK);
253                            mediaWikiTable = mediaWikiTable.replaceAll("\\|\\-", "\n\r");
254                            mediaWikiTable = mediaWikiTable.replaceAll("\\|\\|", "|");
255                            mediaWikiTable = mediaWikiTable.replaceAll(
256                                    "/{4}", StringPool.BLANK);
257    
258                            sb.replace(
259                                    matcher.start(0) + offset,
260                                    matcher.start(0) + originalLength + offset, mediaWikiTable);
261    
262                            offset = mediaWikiTable.length() - originalLength;
263                    }
264    
265                    content = sb.toString();
266    
267                    content = runRegexp(content, "/{2}(\\{{3})", "$1");
268                    content = runRegexp(content, "(\\}{3})/{2}", "$1");
269    
270                    // Remove underscores from links
271    
272                    matcher = Pattern.compile(
273                            "\\[{2}([^\\]]*)\\]{2}", Pattern.DOTALL).matcher(content);
274    
275                    sb = new StringBuffer(content);
276    
277                    while (matcher.find()) {
278                            String link = matcher.group(1).replace(
279                                    StringPool.UNDERLINE, StringPool.SPACE);
280    
281                            sb.replace(matcher.start(1), matcher.end(1), link);
282                    }
283    
284                    return TABLE_OF_CONTENTS + super.postProcess(sb.toString());
285            }
286    
287            private static final String[] _HTML_TAGS = {
288                    "<blockquote>", "</blockquote>", "<br>", "<br/>", "<br />", "<center>",
289                    "</center>", "<cite>", "</cite>","<code>", "</code>", "<div[^>]*>",
290                    "</div>", "<font[^>]*>", "</font>", "<hr>", "<hr/>", "<hr />", "<p>",
291                    "</p>", "<tt>", "</tt>", "<var>", "</var>"
292            };
293    
294            private boolean _strictImportMode;
295    
296    }