001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language; 019 020import org.apache.commons.codec.EncoderException; 021import org.apache.commons.codec.StringEncoder; 022 023/** 024 * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a general purpose scheme to find word 025 * with similar phonemes. 026 * 027 * <p> 028 * This class is thread-safe. Although not strictly immutable, the mutable fields are not actually used. 029 * </p> 030 */ 031public class Soundex implements StringEncoder { 032 033 /** 034 * The marker character used to indicate a silent (ignored) character. These are ignored except when they appear as the first character. 035 * <p> 036 * Note: The {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism because changing it might break existing code. Mappings that don't contain a 037 * silent marker code are treated as though H and W are silent. 038 * </p> 039 * <p> 040 * To override this, use the {@link #Soundex(String, boolean)} constructor. 041 * </p> 042 * 043 * @since 1.11 044 */ 045 public static final char SILENT_MARKER = '-'; 046 047 /** 048 * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position means do not encode, but treat as a separator 049 * when it occurs between consonants with the same code. 050 * <p> 051 * (This constant is provided as both an implementation convenience and to allow Javadoc to pick up the value for the constant values page.) 052 * </p> 053 * <p> 054 * <strong>Note that letters H and W are treated specially.</strong> They are ignored (after the first letter) and don't act as separators between 055 * consonants with the same code. 056 * </p> 057 */ 058 public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202"; 059 060 /** 061 * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position means do not encode. 062 * 063 * @see Soundex#Soundex(char[]) 064 */ 065 private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray(); 066 067 /** 068 * An instance of Soundex using the US_ENGLISH_MAPPING mapping. This treats H and W as silent letters. Apart from when they appear as the first letter, they 069 * are ignored. They don't act as separators between duplicate codes. 070 * 071 * @see #US_ENGLISH_MAPPING_STRING 072 */ 073 public static final Soundex US_ENGLISH = new Soundex(); 074 075 /** 076 * An instance of Soundex using the Simplified Soundex mapping, as described here: http://west-penwith.org.uk/misc/soundex.htm 077 * <p> 078 * This treats H and W the same as vowels (AEIOUY). Such letters aren't encoded (after the first), but they do act as separators when dropping duplicate 079 * codes. The mapping is otherwise the same as for {@link #US_ENGLISH}. 080 * </p> 081 * 082 * @since 1.11 083 */ 084 public static final Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, false); 085 086 /** 087 * An instance of Soundex using the mapping as per the Genealogy site: http://www.genealogy.com/articles/research/00000060.html 088 * <p> 089 * This treats vowels (AEIOUY), H and W as silent letters. Such letters are ignored (after the first) and do not act as separators when dropping duplicate 090 * codes. 091 * </p> 092 * <p> 093 * The codes for consonants are otherwise the same as for {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED}. 094 * </p> 095 * 096 * @since 1.11 097 */ 098 public static final Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2"); 099 100 /** 101 * The maximum length of a Soundex code - Soundex codes are only four characters by definition. 102 * 103 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 104 */ 105 @Deprecated 106 private int maxLength = 4; 107 108 /** 109 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each letter is mapped. This implementation 110 * contains a default map for US_ENGLISH. 111 */ 112 private final char[] soundexMapping; 113 114 /** 115 * Should H and W be treated specially? 116 * <p> 117 * In versions of the code prior to 1.11, the code always treated H and W as silent (ignored) letters. If this field is false, H and W are no longer 118 * special-cased. 119 * </p> 120 */ 121 private final boolean specialCaseHW; 122 123 /** 124 * Creates an instance using US_ENGLISH_MAPPING. 125 * 126 * @see Soundex#Soundex(char[]) 127 * @see Soundex#US_ENGLISH_MAPPING_STRING 128 */ 129 public Soundex() { 130 this.soundexMapping = US_ENGLISH_MAPPING; 131 this.specialCaseHW = true; 132 } 133 134 /** 135 * Creates a Soundex instance using the given mapping. This constructor can be used to provide an internationalized mapping for a non-Western character set. 136 * <p> 137 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each letter is mapped. This implementation 138 * contains a default map for US_ENGLISH 139 * </p> 140 * <p> 141 * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment. 142 * </p> 143 * 144 * @param mapping Mapping array to use when finding the corresponding code for a given character. 145 */ 146 public Soundex(final char[] mapping) { 147 this.soundexMapping = mapping.clone(); 148 this.specialCaseHW = !hasMarker(this.soundexMapping); 149 } 150 151 /** 152 * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping, and/or possibly provide an 153 * internationalized mapping for a non-Western character set. 154 * <p> 155 * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment. 156 * </p> 157 * 158 * @param mapping Mapping string to use when finding the corresponding code for a given character. 159 * @since 1.4 160 */ 161 public Soundex(final String mapping) { 162 this.soundexMapping = mapping.toCharArray(); 163 this.specialCaseHW = !hasMarker(this.soundexMapping); 164 } 165 166 /** 167 * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping, and/or possibly provide an 168 * internationalized mapping for a non-Western character set. 169 * 170 * @param mapping Mapping string to use when finding the corresponding code for a given character. 171 * @param specialCaseHW if true, then H and W are treated as silent letters that are ignored and do not act as separators between duplicate codes. 172 * @since 1.11 173 */ 174 public Soundex(final String mapping, final boolean specialCaseHW) { 175 this.soundexMapping = mapping.toCharArray(); 176 this.specialCaseHW = specialCaseHW; 177 } 178 179 /** 180 * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This return value ranges from 0 through 4: 0 181 * indicates little or no similarity, and 4 indicates strong similarity or identical values. 182 * 183 * @param s1 A String that will be encoded and compared. 184 * @param s2 A String that will be encoded and compared. 185 * @return The number of characters in the two encoded Strings that are the same from 0 to 4. 186 * @see SoundexUtils#difference(StringEncoder,String,String) 187 * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS T-SQL DIFFERENCE</a> 188 * @throws EncoderException if an error occurs encoding one of the strings. 189 * @since 1.3 190 */ 191 public int difference(final String s1, final String s2) throws EncoderException { 192 return SoundexUtils.difference(this, s1, s2); 193 } 194 195 /** 196 * Encodes an Object using the Soundex algorithm. This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an 197 * EncoderException if the supplied object is not of type {@link String}. 198 * 199 * @param obj Object to encode. 200 * @return An object (or type {@link String}) containing the Soundex code which corresponds to the String supplied. 201 * @throws EncoderException if the parameter supplied is not of type {@link String}. 202 * @throws IllegalArgumentException if a character is not mapped. 203 */ 204 @Override 205 public Object encode(final Object obj) throws EncoderException { 206 if (!(obj instanceof String)) { 207 throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String"); 208 } 209 return soundex((String) obj); 210 } 211 212 /** 213 * Encodes a String using the Soundex algorithm. 214 * 215 * @param str A String object to encode. 216 * @return A Soundex code corresponding to the String supplied. 217 * @throws IllegalArgumentException if a character is not mapped. 218 */ 219 @Override 220 public String encode(final String str) { 221 return soundex(str); 222 } 223 224 /** 225 * Returns the maxLength. Standard Soundex 226 * 227 * @return the maxLength. 228 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 229 */ 230 @Deprecated 231 public int getMaxLength() { 232 return this.maxLength; 233 } 234 235 private boolean hasMarker(final char[] mapping) { 236 for (final char ch : mapping) { 237 if (ch == SILENT_MARKER) { 238 return true; 239 } 240 } 241 return false; 242 } 243 244 /** 245 * Maps the given upper-case character to its Soundex code. 246 * 247 * @param ch An upper-case character. 248 * @return A Soundex code. 249 * @throws IllegalArgumentException Thrown if {@code ch} is not mapped. 250 */ 251 private char map(final char ch) { 252 final int index = ch - 'A'; 253 if (index < 0 || index >= this.soundexMapping.length) { 254 throw new IllegalArgumentException("The character is not mapped: " + ch + " (index=" + index + ")"); 255 } 256 return this.soundexMapping[index]; 257 } 258 259 /** 260 * Sets the maxLength. 261 * 262 * @param maxLength The maxLength to set. 263 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 264 */ 265 @Deprecated 266 public void setMaxLength(final int maxLength) { 267 this.maxLength = maxLength; 268 } 269 270 /** 271 * Retrieves the Soundex code for a given String object. 272 * 273 * @param str String to encode using the Soundex algorithm. 274 * @return A Soundex code for the String supplied. 275 * @throws IllegalArgumentException if a character is not mapped. 276 */ 277 public String soundex(String str) { 278 if (str == null) { 279 return null; 280 } 281 str = SoundexUtils.clean(str); 282 if (str.isEmpty()) { 283 return str; 284 } 285 final char[] out = { '0', '0', '0', '0' }; 286 int count = 0; 287 final char first = str.charAt(0); 288 out[count++] = first; 289 char lastDigit = map(first); // previous digit 290 for (int i = 1; i < str.length() && count < out.length; i++) { 291 final char ch = str.charAt(i); 292 if (this.specialCaseHW && (ch == 'H' || ch == 'W')) { // these are ignored completely 293 continue; 294 } 295 final char digit = map(ch); 296 if (digit == SILENT_MARKER) { 297 continue; 298 } 299 if (digit != '0' && digit != lastDigit) { // don't store vowels or repeats 300 out[count++] = digit; 301 } 302 lastDigit = digit; 303 } 304 return new String(out); 305 } 306}