001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.codec.language; 018 019import java.util.Locale; 020 021import org.apache.commons.codec.EncoderException; 022import org.apache.commons.codec.StringEncoder; 023 024/** 025 * Match Rating Approach Phonetic Algorithm Developed by <CITE>Western Airlines</CITE> in 1977. 026 * <p> 027 * This class is immutable and thread-safe. 028 * </p> 029 * 030 * @see <a href="https://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia - Match Rating Approach</a> 031 * @since 1.8 032 */ 033public class MatchRatingApproachEncoder implements StringEncoder { 034 035 private static final String SPACE = " "; 036 037 private static final String EMPTY = ""; 038 039 /** 040 * The plain letter equivalent of the accented letters. 041 */ 042 private static final String PLAIN_ASCII = "AaEeIiOoUu" + // grave 043 "AaEeIiOoUuYy" + // acute 044 "AaEeIiOoUuYy" + // circumflex 045 "AaOoNn" + // tilde 046 "AaEeIiOoUuYy" + // umlaut 047 "Aa" + // ring 048 "Cc" + // cedilla 049 "OoUu"; // double acute 050 051 /** 052 * Unicode characters corresponding to various accented letters. For example: \u00DA is U acute etc... 053 */ 054 private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" + 055 "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" + 056 "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" + 057 "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF\u00C5\u00E5\u00C7\u00E7\u0150\u0151\u0170\u0171"; 058 059 /** 060 * Double consonants. 061 */ 062 private static final String[] DOUBLE_CONSONANT = 063 { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS", 064 "TT", "VV", "WW", "XX", "YY", "ZZ" }; 065 066 /** 067 * Constructs a new instance. 068 */ 069 public MatchRatingApproachEncoder() { 070 // empty 071 } 072 073 /** 074 * Cleans up a name: 1. Upper-cases everything 2. Removes some common punctuation 3. Removes accents 4. Removes any 075 * spaces. 076 * 077 * <h2>API Usage</h2> 078 * <p> 079 * Consider this method private, it is package protected for unit testing only. 080 * </p> 081 * 082 * @param name 083 * The name to be cleaned. 084 * @return The cleaned name. 085 */ 086 String cleanName(final String name) { 087 String upperName = name.toUpperCase(Locale.ENGLISH); 088 089 final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" }; 090 for (final String str : charsToTrim) { 091 upperName = upperName.replaceAll(str, EMPTY); 092 } 093 094 upperName = removeAccents(upperName); 095 return upperName.replaceAll("\\s+", EMPTY); 096 } 097 098 /** 099 * Encodes an Object using the Match Rating Approach algorithm. Method is here to satisfy the requirements of the 100 * Encoder interface Throws an EncoderException if input object is not of type {@link String}. 101 * 102 * @param object 103 * Object to encode. 104 * @return An object (or type {@link String}) containing the Match Rating Approach code which corresponds to the 105 * String supplied. 106 * @throws EncoderException 107 * if the parameter supplied is not of type {@link String}. 108 */ 109 @Override 110 public final Object encode(final Object object) throws EncoderException { 111 if (!(object instanceof String)) { 112 throw new EncoderException( 113 "Parameter supplied to Match Rating Approach encoder is not of type java.lang.String"); 114 } 115 return encode((String) object); 116 } 117 118 /** 119 * Encodes a String using the Match Rating Approach (MRA) algorithm. 120 * 121 * @param name 122 * String object to encode. 123 * @return The MRA code corresponding to the String supplied. 124 */ 125 @Override 126 public final String encode(String name) { 127 // Bulletproof for trivial input - NINO 128 if (name == null || EMPTY.equalsIgnoreCase(name) || SPACE.equalsIgnoreCase(name) || name.length() == 1) { 129 return EMPTY; 130 } 131 132 // Preprocessing 133 name = cleanName(name); 134 135 // Bulletproof if name becomes empty after cleanName(name) 136 if (SPACE.equals(name) || name.isEmpty()) { 137 return EMPTY; 138 } 139 140 // BEGIN: Actual encoding part of the algorithm... 141 // 1. Delete all vowels unless the vowel begins the word 142 name = removeVowels(name); 143 144 // Bulletproof if name becomes empty after removeVowels(name) 145 if (SPACE.equals(name) || name.isEmpty()) { 146 return EMPTY; 147 } 148 149 // 2. Remove second consonant from any double consonant 150 name = removeDoubleConsonants(name); 151 152 return getFirst3Last3(name); 153 } 154 155 /** 156 * Gets the first and last 3 letters of a name (if > 6 characters) Else just returns the name. 157 * 158 * <h2>API Usage</h2> 159 * <p> 160 * Consider this method private, it is package protected for unit testing only. 161 * </p> 162 * 163 * @param name 164 * The string to get the substrings from. 165 * @return Annexed first and last 3 letters of input word. 166 */ 167 String getFirst3Last3(final String name) { 168 final int nameLength = name.length(); 169 170 if (nameLength > 6) { 171 final String firstThree = name.substring(0, 3); 172 final String lastThree = name.substring(nameLength - 3, nameLength); 173 return firstThree + lastThree; 174 } 175 return name; 176 } 177 178 /** 179 * Obtains the min rating of the length sum of the 2 names. In essence the larger the sum length the smaller the 180 * min rating. Values strictly from documentation. 181 * 182 * <h2>API Usage</h2> 183 * <p> 184 * Consider this method private, it is package protected for unit testing only. 185 * </p> 186 * 187 * @param sumLength 188 * The length of 2 strings sent down. 189 * @return The min rating value. 190 */ 191 int getMinRating(final int sumLength) { 192 int minRating = 0; 193 194 if (sumLength <= 4) { 195 minRating = 5; 196 } else if (sumLength <= 7) { // already know it is at least 5 197 minRating = 4; 198 } else if (sumLength <= 11) { // already know it is at least 8 199 minRating = 3; 200 } else if (sumLength == 12) { 201 minRating = 2; 202 } else { 203 minRating = 1; // docs said little here. 204 } 205 206 return minRating; 207 } 208 209 /** 210 * Determines if two names are homophonous via Match Rating Approach (MRA) algorithm. It should be noted that the 211 * strings are cleaned in the same way as {@link #encode(String)}. 212 * 213 * @param name1 214 * First of the 2 strings (names) to compare. 215 * @param name2 216 * Second of the 2 names to compare. 217 * @return {@code true} if the encodings are identical {@code false} otherwise. 218 */ 219 public boolean isEncodeEquals(String name1, String name2) { 220 // Bulletproof for trivial input - NINO 221 if (name1 == null || EMPTY.equalsIgnoreCase(name1) || SPACE.equalsIgnoreCase(name1)) { 222 return false; 223 } 224 if (name2 == null || EMPTY.equalsIgnoreCase(name2) || SPACE.equalsIgnoreCase(name2)) { 225 return false; 226 } 227 if (name1.length() == 1 || name2.length() == 1) { 228 return false; 229 } 230 if (name1.equalsIgnoreCase(name2)) { 231 return true; 232 } 233 234 // Preprocessing 235 name1 = cleanName(name1); 236 name2 = cleanName(name2); 237 238 // Actual MRA Algorithm 239 240 // 1. Remove vowels 241 name1 = removeVowels(name1); 242 name2 = removeVowels(name2); 243 244 // 2. Remove double consonants 245 name1 = removeDoubleConsonants(name1); 246 name2 = removeDoubleConsonants(name2); 247 248 // 3. Reduce down to 3 letters 249 name1 = getFirst3Last3(name1); 250 name2 = getFirst3Last3(name2); 251 252 // 4. Check for length difference - if 3 or greater, then no similarity 253 // comparison is done 254 if (Math.abs(name1.length() - name2.length()) >= 3) { 255 return false; 256 } 257 258 // 5. Obtain the minimum rating value by calculating the length sum of the 259 // encoded Strings and sending it down. 260 final int sumLength = Math.abs(name1.length() + name2.length()); 261 final int minRating = getMinRating(sumLength); 262 263 // 6. Process the encoded Strings from left to right and remove any 264 // identical characters found from both Strings respectively. 265 final int count = leftToRightThenRightToLeftProcessing(name1, name2); 266 267 // 7. Each PNI item that has a similarity rating equal to or greater than 268 // the min is considered to be a good candidate match 269 return count >= minRating; 270 271 } 272 273 /** 274 * Determines if a letter is a vowel. 275 * 276 * <h2>API Usage</h2> 277 * <p> 278 * Consider this method private, it is package protected for unit testing only. 279 * </p> 280 * 281 * @param letter 282 * The letter under investigation 283 * @return True if a vowel, else false 284 */ 285 boolean isVowel(final String letter) { 286 return letter.equalsIgnoreCase("E") || letter.equalsIgnoreCase("A") || letter.equalsIgnoreCase("O") || 287 letter.equalsIgnoreCase("I") || letter.equalsIgnoreCase("U"); 288 } 289 290 /** 291 * Processes the names from left to right (first) then right to left removing identical letters in same positions. 292 * Then subtracts the longer string that remains from 6 and returns this. 293 * 294 * <h2>API Usage</h2> 295 * <p> 296 * Consider this method private, it is package protected for unit testing only. 297 * </p> 298 * 299 * @param name1 first name. 300 * @param name1 second name. 301 * @return the length as above. 302 */ 303 int leftToRightThenRightToLeftProcessing(final String name1, final String name2) { 304 final char[] name1Char = name1.toCharArray(); 305 final char[] name2Char = name2.toCharArray(); 306 307 final int name1Size = name1.length() - 1; 308 final int name2Size = name2.length() - 1; 309 310 String name1LtRStart = EMPTY; 311 String name1LtREnd = EMPTY; 312 313 String name2RtLStart = EMPTY; 314 String name2RtLEnd = EMPTY; 315 316 for (int i = 0; i < name1Char.length; i++) { 317 if (i > name2Size) { 318 break; 319 } 320 321 name1LtRStart = name1.substring(i, i + 1); 322 name1LtREnd = name1.substring(name1Size - i, name1Size - i + 1); 323 324 name2RtLStart = name2.substring(i, i + 1); 325 name2RtLEnd = name2.substring(name2Size - i, name2Size - i + 1); 326 327 // Left to right... 328 if (name1LtRStart.equals(name2RtLStart)) { 329 name1Char[i] = ' '; 330 name2Char[i] = ' '; 331 } 332 333 // Right to left... 334 if (name1LtREnd.equals(name2RtLEnd)) { 335 name1Char[name1Size - i] = ' '; 336 name2Char[name2Size - i] = ' '; 337 } 338 } 339 340 // Char arrays -> string & remove extraneous space 341 final String strA = new String(name1Char).replaceAll("\\s+", EMPTY); 342 final String strB = new String(name2Char).replaceAll("\\s+", EMPTY); 343 344 // Final bit - subtract the longest string from 6 and return this int value 345 if (strA.length() > strB.length()) { 346 return Math.abs(6 - strA.length()); 347 } 348 return Math.abs(6 - strB.length()); 349 } 350 351 /** 352 * Removes accented letters and replaces with non-accented ASCII equivalent Case is preserved. 353 * http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29 354 * 355 * @param accentedWord 356 * The word that may have accents in it. 357 * @return De-accented word. 358 */ 359 String removeAccents(final String accentedWord) { 360 if (accentedWord == null) { 361 return null; 362 } 363 364 final StringBuilder sb = new StringBuilder(); 365 final int n = accentedWord.length(); 366 367 for (int i = 0; i < n; i++) { 368 final char c = accentedWord.charAt(i); 369 final int pos = UNICODE.indexOf(c); 370 if (pos > -1) { 371 sb.append(PLAIN_ASCII.charAt(pos)); 372 } else { 373 sb.append(c); 374 } 375 } 376 377 return sb.toString(); 378 } 379 380 /** 381 * Replaces any double consonant pair with the single letter equivalent. 382 * 383 * <h2>API Usage</h2> 384 * <p> 385 * Consider this method private, it is package protected for unit testing only. 386 * </p> 387 * 388 * @param name 389 * String to have double consonants removed. 390 * @return Single consonant word. 391 */ 392 String removeDoubleConsonants(final String name) { 393 String replacedName = name.toUpperCase(Locale.ENGLISH); 394 for (final String dc : DOUBLE_CONSONANT) { 395 if (replacedName.contains(dc)) { 396 final String singleLetter = dc.substring(0, 1); 397 replacedName = replacedName.replace(dc, singleLetter); 398 } 399 } 400 return replacedName; 401 } 402 403 /** 404 * Deletes all vowels unless the vowel begins the word. 405 * 406 * <h2>API Usage</h2> 407 * <p> 408 * Consider this method private, it is package protected for unit testing only. 409 * </p> 410 * 411 * @param name 412 * The name to have vowels removed. 413 * @return De-voweled word. 414 */ 415 String removeVowels(String name) { 416 // Extract first letter 417 final String firstLetter = name.substring(0, 1); 418 419 name = name.replace("A", EMPTY); 420 name = name.replace("E", EMPTY); 421 name = name.replace("I", EMPTY); 422 name = name.replace("O", EMPTY); 423 name = name.replace("U", EMPTY); 424 425 name = name.replaceAll("\\s{2,}\\b", SPACE); 426 427 // return isVowel(firstLetter) ? (firstLetter + name) : name; 428 if (isVowel(firstLetter)) { 429 return firstLetter + name; 430 } 431 return name; 432 } 433}