Skip navigation.
Home

Reply to comment

Utility class for removing diacritics.

import java.util.*;

/**
* A utility class that removes diacritics from characters. For example, the Czech
* "příliš žluťoučký kůň úpěl ďábelské ódy" becomes "prilis zlutoucky
* kun upel dabelske ody".
* Data extracted from UnicodeData.txt.
*
* Legal stuff:
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Diacritics.java.
*
* The Initial Developer of the Original Code is Petr Pudlák.
*
* Contributor(s):
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 3 or later (the "GPL"), or
* the GNU Lesser General Public License Version 3 or later (the "LGPL"),
* in which case
* the provisions of the GPL or the LGPL are applicable instead of those
* above. If you wish to allow use of your version of this file only under the
* terms of either the GPL or the LGPL, and not to allow others to use your
* version of this file under the terms of the MPL, indicate your decision by
* deleting the provisions above and replace them with the notice and other
* provisions required by the GPL or the LGPL. If you do not delete the
* provisions above, a recipient may use your version of this file under the
* terms of any one of the MPL, the GPL or the LGPL.
*
*
*/
public final class Diacritics
{
    /*
     * The two tables below are parallel: the character at index i of
     * FROM_CHARS is replaced by the character at index i of TO_CHARS.
     * They are laid out in matching rows (same number of entries per row)
     * so that the correspondence can be checked by eye. Each TO_CHARS entry
     * is a single-step replacement and may itself appear in FROM_CHARS
     * (for example U+01D5 maps to U+00DC, which in turn maps to 'U');
     * buildFilterMap() follows such chains when the lookup map is built.
     */

    /** Characters that are replaced by {@link #filter(char)}. */
    private static final String FROM_CHARS =
        // U+00C0..U+00DD (26)
        "\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5\u00C7\u00C8\u00C9\u00CA\u00CB\u00CC\u00CD\u00CE\u00CF\u00D1\u00D2\u00D3\u00D4\u00D5\u00D6\u00D9\u00DA\u00DB\u00DC\u00DD"
        // U+00E0..U+00FF (27)
        + "\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5\u00E7\u00E8\u00E9\u00EA\u00EB\u00EC\u00ED\u00EE\u00EF\u00F1\u00F2\u00F3\u00F4\u00F5\u00F6\u00F9\u00FA\u00FB\u00FC\u00FD\u00FF"
        // U+0100..U+010F (16)
        + "\u0100\u0101\u0102\u0103\u0104\u0105\u0106\u0107\u0108\u0109\u010A\u010B\u010C\u010D\u010E\u010F"
        // U+0112..U+0125 (20)
        + "\u0112\u0113\u0114\u0115\u0116\u0117\u0118\u0119\u011A\u011B\u011C\u011D\u011E\u011F\u0120\u0121\u0122\u0123\u0124\u0125"
        // U+0128..U+0137 (15)
        + "\u0128\u0129\u012A\u012B\u012C\u012D\u012E\u012F\u0130\u0132\u0133\u0134\u0135\u0136\u0137"
        // U+0139..U+0149 (15)
        + "\u0139\u013A\u013B\u013C\u013D\u013E\u013F\u0140\u0143\u0144\u0145\u0146\u0147\u0148\u0149"
        // U+014C..U+0165 (24)
        + "\u014C\u014D\u014E\u014F\u0150\u0151\u0154\u0155\u0156\u0157\u0158\u0159\u015A\u015B\u015C\u015D\u015E\u015F\u0160\u0161\u0162\u0163\u0164\u0165"
        // U+0168..U+017F (24)
        + "\u0168\u0169\u016A\u016B\u016C\u016D\u016E\u016F\u0170\u0171\u0172\u0173\u0174\u0175\u0176\u0177\u0178\u0179\u017A\u017B\u017C\u017D\u017E\u017F"
        // U+01A0..U+01CC (13)
        + "\u01A0\u01A1\u01AF\u01B0\u01C4\u01C5\u01C6\u01C7\u01C8\u01C9\u01CA\u01CB\u01CC"
        // U+01CD..U+01DC (16)
        + "\u01CD\u01CE\u01CF\u01D0\u01D1\u01D2\u01D3\u01D4\u01D5\u01D6\u01D7\u01D8\u01D9\u01DA\u01DB\u01DC"
        // U+01DE..U+01EF (16)
        + "\u01DE\u01DF\u01E0\u01E1\u01E2\u01E3\u01E6\u01E7\u01E8\u01E9\u01EA\u01EB\u01EC\u01ED\u01EE\u01EF"
        // U+01F0..U+01FF (14)
        + "\u01F0\u01F1\u01F2\u01F3\u01F4\u01F5\u01F8\u01F9\u01FA\u01FB\u01FC\u01FD\u01FE\u01FF"
        // U+0200..U+020F (16)
        + "\u0200\u0201\u0202\u0203\u0204\u0205\u0206\u0207\u0208\u0209\u020A\u020B\u020C\u020D\u020E\u020F"
        // U+0210..U+021F (14)
        + "\u0210\u0211\u0212\u0213\u0214\u0215\u0216\u0217\u0218\u0219\u021A\u021B\u021E\u021F"
        // U+0226..U+0233 (14)
        + "\u0226\u0227\u0228\u0229\u022A\u022B\u022C\u022D\u022E\u022F\u0230\u0231\u0232\u0233"
        // U+1D62..U+1D65 (4)
        + "\u1D62\u1D63\u1D64\u1D65"
        // U+1E00..U+1E0F (16)
        + "\u1E00\u1E01\u1E02\u1E03\u1E04\u1E05\u1E06\u1E07\u1E08\u1E09\u1E0A\u1E0B\u1E0C\u1E0D\u1E0E\u1E0F"
        // U+1E10..U+1E1F (16)
        + "\u1E10\u1E11\u1E12\u1E13\u1E14\u1E15\u1E16\u1E17\u1E18\u1E19\u1E1A\u1E1B\u1E1C\u1E1D\u1E1E\u1E1F"
        // U+1E20..U+1E2F (16)
        + "\u1E20\u1E21\u1E22\u1E23\u1E24\u1E25\u1E26\u1E27\u1E28\u1E29\u1E2A\u1E2B\u1E2C\u1E2D\u1E2E\u1E2F"
        // U+1E30..U+1E3F (16)
        + "\u1E30\u1E31\u1E32\u1E33\u1E34\u1E35\u1E36\u1E37\u1E38\u1E39\u1E3A\u1E3B\u1E3C\u1E3D\u1E3E\u1E3F"
        // U+1E40..U+1E4F (16)
        + "\u1E40\u1E41\u1E42\u1E43\u1E44\u1E45\u1E46\u1E47\u1E48\u1E49\u1E4A\u1E4B\u1E4C\u1E4D\u1E4E\u1E4F"
        // U+1E50..U+1E5F (16)
        + "\u1E50\u1E51\u1E52\u1E53\u1E54\u1E55\u1E56\u1E57\u1E58\u1E59\u1E5A\u1E5B\u1E5C\u1E5D\u1E5E\u1E5F"
        // U+1E60..U+1E6F (16)
        + "\u1E60\u1E61\u1E62\u1E63\u1E64\u1E65\u1E66\u1E67\u1E68\u1E69\u1E6A\u1E6B\u1E6C\u1E6D\u1E6E\u1E6F"
        // U+1E70..U+1E7F (16)
        + "\u1E70\u1E71\u1E72\u1E73\u1E74\u1E75\u1E76\u1E77\u1E78\u1E79\u1E7A\u1E7B\u1E7C\u1E7D\u1E7E\u1E7F"
        // U+1E80..U+1E8F (16)
        + "\u1E80\u1E81\u1E82\u1E83\u1E84\u1E85\u1E86\u1E87\u1E88\u1E89\u1E8A\u1E8B\u1E8C\u1E8D\u1E8E\u1E8F"
        // U+1E90..U+1E9B (12)
        + "\u1E90\u1E91\u1E92\u1E93\u1E94\u1E95\u1E96\u1E97\u1E98\u1E99\u1E9A\u1E9B"
        // U+1EA0..U+1EAF (16)
        + "\u1EA0\u1EA1\u1EA2\u1EA3\u1EA4\u1EA5\u1EA6\u1EA7\u1EA8\u1EA9\u1EAA\u1EAB\u1EAC\u1EAD\u1EAE\u1EAF"
        // U+1EB0..U+1EBF (16)
        + "\u1EB0\u1EB1\u1EB2\u1EB3\u1EB4\u1EB5\u1EB6\u1EB7\u1EB8\u1EB9\u1EBA\u1EBB\u1EBC\u1EBD\u1EBE\u1EBF"
        // U+1EC0..U+1ECF (16)
        + "\u1EC0\u1EC1\u1EC2\u1EC3\u1EC4\u1EC5\u1EC6\u1EC7\u1EC8\u1EC9\u1ECA\u1ECB\u1ECC\u1ECD\u1ECE\u1ECF"
        // U+1ED0..U+1EDF (16)
        + "\u1ED0\u1ED1\u1ED2\u1ED3\u1ED4\u1ED5\u1ED6\u1ED7\u1ED8\u1ED9\u1EDA\u1EDB\u1EDC\u1EDD\u1EDE\u1EDF"
        // U+1EE0..U+1EEF (16)
        + "\u1EE0\u1EE1\u1EE2\u1EE3\u1EE4\u1EE5\u1EE6\u1EE7\u1EE8\u1EE9\u1EEA\u1EEB\u1EEC\u1EED\u1EEE\u1EEF"
        // U+1EF0..U+1EF9 (10)
        + "\u1EF0\u1EF1\u1EF2\u1EF3\u1EF4\u1EF5\u1EF6\u1EF7\u1EF8\u1EF9"
        // U+2090..U+2094, U+2C7C, U+FB00..U+FB06 (13)
        + "\u2090\u2091\u2092\u2093\u2094\u2C7C\uFB00\uFB01\uFB02\uFB03\uFB04\uFB05\uFB06";

    /** Single-step replacements, index-aligned with {@link #FROM_CHARS}. */
    private static final String TO_CHARS =
        // for U+00C0..U+00DD (26)
        "\u0041\u0041\u0041\u0041\u0041\u0041\u0043\u0045\u0045\u0045\u0045\u0049\u0049\u0049\u0049\u004E\u004F\u004F\u004F\u004F\u004F\u0055\u0055\u0055\u0055\u0059"
        // for U+00E0..U+00FF (27)
        + "\u0061\u0061\u0061\u0061\u0061\u0061\u0063\u0065\u0065\u0065\u0065\u0069\u0069\u0069\u0069\u006E\u006F\u006F\u006F\u006F\u006F\u0075\u0075\u0075\u0075\u0079\u0079"
        // for U+0100..U+010F (16)
        + "\u0041\u0061\u0041\u0061\u0041\u0061\u0043\u0063\u0043\u0063\u0043\u0063\u0043\u0063\u0044\u0064"
        // for U+0112..U+0125 (20)
        + "\u0045\u0065\u0045\u0065\u0045\u0065\u0045\u0065\u0045\u0065\u0047\u0067\u0047\u0067\u0047\u0067\u0047\u0067\u0048\u0068"
        // for U+0128..U+0137 (15)
        + "\u0049\u0069\u0049\u0069\u0049\u0069\u0049\u0069\u0049\u0049\u0069\u004A\u006A\u004B\u006B"
        // for U+0139..U+0149 (15)
        + "\u004C\u006C\u004C\u006C\u004C\u006C\u004C\u006C\u004E\u006E\u004E\u006E\u004E\u006E\u02BC"
        // for U+014C..U+0165 (24)
        + "\u004F\u006F\u004F\u006F\u004F\u006F\u0052\u0072\u0052\u0072\u0052\u0072\u0053\u0073\u0053\u0073\u0053\u0073\u0053\u0073\u0054\u0074\u0054\u0074"
        // for U+0168..U+017F (24)
        + "\u0055\u0075\u0055\u0075\u0055\u0075\u0055\u0075\u0055\u0075\u0055\u0075\u0057\u0077\u0059\u0079\u0059\u005A\u007A\u005A\u007A\u005A\u007A\u0073"
        // for U+01A0..U+01CC (13)
        + "\u004F\u006F\u0055\u0075\u0044\u0044\u0064\u004C\u004C\u006C\u004E\u004E\u006E"
        // for U+01CD..U+01DC (16)
        + "\u0041\u0061\u0049\u0069\u004F\u006F\u0055\u0075\u00DC\u00FC\u00DC\u00FC\u00DC\u00FC\u00DC\u00FC"
        // for U+01DE..U+01EF (16)
        + "\u00C4\u00E4\u0226\u0227\u00C6\u00E6\u0047\u0067\u004B\u006B\u004F\u006F\u01EA\u01EB\u01B7\u0292"
        // for U+01F0..U+01FF (14)
        + "\u006A\u0044\u0044\u0064\u0047\u0067\u004E\u006E\u00C5\u00E5\u00C6\u00E6\u00D8\u00F8"
        // for U+0200..U+020F (16)
        + "\u0041\u0061\u0041\u0061\u0045\u0065\u0045\u0065\u0049\u0069\u0049\u0069\u004F\u006F\u004F\u006F"
        // for U+0210..U+021F (14)
        + "\u0052\u0072\u0052\u0072\u0055\u0075\u0055\u0075\u0053\u0073\u0054\u0074\u0048\u0068"
        // for U+0226..U+0233 (14)
        + "\u0041\u0061\u0045\u0065\u00D6\u00F6\u00D5\u00F5\u004F\u006F\u022E\u022F\u0059\u0079"
        // for U+1D62..U+1D65 (4)
        + "\u0069\u0072\u0075\u0076"
        // for U+1E00..U+1E0F (16)
        + "\u0041\u0061\u0042\u0062\u0042\u0062\u0042\u0062\u00C7\u00E7\u0044\u0064\u0044\u0064\u0044\u0064"
        // for U+1E10..U+1E1F (16)
        + "\u0044\u0064\u0044\u0064\u0112\u0113\u0112\u0113\u0045\u0065\u0045\u0065\u0228\u0229\u0046\u0066"
        // for U+1E20..U+1E2F (16)
        + "\u0047\u0067\u0048\u0068\u0048\u0068\u0048\u0068\u0048\u0068\u0048\u0068\u0049\u0069\u00CF\u00EF"
        // for U+1E30..U+1E3F (16)
        + "\u004B\u006B\u004B\u006B\u004B\u006B\u004C\u006C\u1E36\u1E37\u004C\u006C\u004C\u006C\u004D\u006D"
        // for U+1E40..U+1E4F (16)
        + "\u004D\u006D\u004D\u006D\u004E\u006E\u004E\u006E\u004E\u006E\u004E\u006E\u00D5\u00F5\u00D5\u00F5"
        // for U+1E50..U+1E5F (16)
        + "\u014C\u014D\u014C\u014D\u0050\u0070\u0050\u0070\u0052\u0072\u0052\u0072\u1E5A\u1E5B\u0052\u0072"
        // for U+1E60..U+1E6F (16)
        + "\u0053\u0073\u0053\u0073\u015A\u015B\u0160\u0161\u1E62\u1E63\u0054\u0074\u0054\u0074\u0054\u0074"
        // for U+1E70..U+1E7F (16)
        + "\u0054\u0074\u0055\u0075\u0055\u0075\u0055\u0075\u0168\u0169\u016A\u016B\u0056\u0076\u0056\u0076"
        // for U+1E80..U+1E8F (16)
        + "\u0057\u0077\u0057\u0077\u0057\u0077\u0057\u0077\u0057\u0077\u0058\u0078\u0058\u0078\u0059\u0079"
        // for U+1E90..U+1E9B (12)
        + "\u005A\u007A\u005A\u007A\u005A\u007A\u0068\u0074\u0077\u0079\u0061\u017F"
        // for U+1EA0..U+1EAF (16)
        + "\u0041\u0061\u0041\u0061\u00C2\u00E2\u00C2\u00E2\u00C2\u00E2\u00C2\u00E2\u1EA0\u1EA1\u0102\u0103"
        // for U+1EB0..U+1EBF (16)
        + "\u0102\u0103\u0102\u0103\u0102\u0103\u1EA0\u1EA1\u0045\u0065\u0045\u0065\u0045\u0065\u00CA\u00EA"
        // for U+1EC0..U+1ECF (16)
        + "\u00CA\u00EA\u00CA\u00EA\u00CA\u00EA\u1EB8\u1EB9\u0049\u0069\u0049\u0069\u004F\u006F\u004F\u006F"
        // for U+1ED0..U+1EDF (16)
        + "\u00D4\u00F4\u00D4\u00F4\u00D4\u00F4\u00D4\u00F4\u1ECC\u1ECD\u01A0\u01A1\u01A0\u01A1\u01A0\u01A1"
        // for U+1EE0..U+1EEF (16)
        + "\u01A0\u01A1\u01A0\u01A1\u0055\u0075\u0055\u0075\u01AF\u01B0\u01AF\u01B0\u01AF\u01B0\u01AF\u01B0"
        // for U+1EF0..U+1EF9 (10)
        + "\u01AF\u01B0\u0059\u0079\u0059\u0079\u0059\u0079\u0059\u0079"
        // for U+2090..U+2094, U+2C7C, U+FB00..U+FB06 (13)
        + "\u0061\u0065\u006F\u0078\u0259\u006A\u0066\u0066\u0066\u0066\u0066\u017F\u0073";

    /**
     * Lookup from a character in {@link #FROM_CHARS} to its final replacement,
     * with replacement chains already followed to their end. Populated once
     * at class initialization and never modified afterwards.
     */
    private static final Map<Character, Character> filterMap = buildFilterMap();

    /** Not instantiable: the class only offers static methods. */
    private Diacritics()
    {
    }

    /**
     * Replaces a single character by its diacritic-free counterpart.
     *
     * @param c the character to convert
     * @return the replacement listed for {@code c} in the table, or {@code c}
     *         itself when the table has no entry for it
     */
    public static char filter(char c)
    {
        final Character replacement = filterMap.get(Character.valueOf(c));
        return replacement != null ? replacement.charValue() : c;
    }

    /**
     * Applies {@link #filter(char)} to every character of a string. The
     * result always has the same length as the input because each character
     * is replaced by exactly one character.
     *
     * @param s the text to convert; may be {@code null}
     * @return the converted text, or {@code null} when {@code s} is {@code null}
     */
    public static String filter(String s)
    {
        if (s == null)
        {
            return null;
        }
        final int length = s.length();
        final StringBuilder out = new StringBuilder(length);
        for (int i = 0; i < length; i++)
        {
            out.append(filter(s.charAt(i)));
        }
        return out.toString();
    }

    /**
     * Builds the lookup map from the two parallel tables.
     *
     * The tables hold single-step replacements, and some replacements are
     * themselves keys of the table (U+01D5 maps to U+00DC, which maps to 'U').
     * Each entry is therefore followed through the table until it reaches a
     * character that has no entry of its own, so that a single lookup in
     * {@link #filter(char)} yields the final character.
     *
     * @return the map from source character to fully resolved replacement
     * @throws IllegalStateException if the two tables differ in length
     */
    private static Map<Character, Character> buildFilterMap()
    {
        final int count = FROM_CHARS.length();
        if (TO_CHARS.length() != count)
        {
            throw new IllegalStateException(
                "FROM_CHARS has " + count + " entries but TO_CHARS has " + TO_CHARS.length());
        }

        // Sized so that the default 0.75 load factor never triggers a rehash.
        final int capacity = count * 4 / 3 + 1;

        final Map<Character, Character> singleStep = new HashMap<Character, Character>(capacity);
        for (int i = 0; i < count; i++)
        {
            singleStep.put(Character.valueOf(FROM_CHARS.charAt(i)),
                           Character.valueOf(TO_CHARS.charAt(i)));
        }

        final Map<Character, Character> resolved = new HashMap<Character, Character>(capacity);
        for (Map.Entry<Character, Character> entry : singleStep.entrySet())
        {
            Character target = entry.getValue();
            // The hop limit only matters if the table ever contained a cycle;
            // it guarantees class initialization terminates in that case.
            for (int hops = 0; hops < count; hops++)
            {
                final Character next = singleStep.get(target);
                if (next == null || next.equals(target))
                {
                    break;
                }
                target = next;
            }
            resolved.put(entry.getKey(), target);
        }
        return resolved;
    }
}

AttachmentSize
Diacritics.java9.24 KB

Reply

  • Web page addresses and e-mail addresses turn into links automatically.
  • Allowed HTML tags: <a> <em> <strong> <cite> <code> <ul> <ol> <li> <dl> <dt> <dd>
  • Lines and paragraphs break automatically.

More information about formatting options