1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.language;
19
20 import org.apache.commons.codec.EncoderException;
21 import org.apache.commons.codec.StringEncoder;
22
23 /**
24 * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a general purpose scheme to find word
25 * with similar phonemes.
26 *
27 * <p>
28 * This class is thread-safe. Although not strictly immutable, the mutable fields are not actually used.
29 * </p>
30 */
31 public class Soundex implements StringEncoder {
32
33 /**
34 * The marker character used to indicate a silent (ignored) character. These are ignored except when they appear as the first character.
35 * <p>
36 * Note: The {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism because changing it might break existing code. Mappings that don't contain a
37 * silent marker code are treated as though H and W are silent.
38 * </p>
39 * <p>
40 * To override this, use the {@link #Soundex(String, boolean)} constructor.
41 * </p>
42 *
43 * @since 1.11
44 */
45 public static final char SILENT_MARKER = '-';
46
47 /**
48 * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position means do not encode, but treat as a separator
49 * when it occurs between consonants with the same code.
50 * <p>
51 * (This constant is provided as both an implementation convenience and to allow Javadoc to pick up the value for the constant values page.)
52 * </p>
53 * <p>
54 * <strong>Note that letters H and W are treated specially.</strong> They are ignored (after the first letter) and don't act as separators between
55 * consonants with the same code.
56 * </p>
57 */
58 public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
59
60 /**
61 * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position means do not encode.
62 *
63 * @see Soundex#Soundex(char[])
64 */
65 private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
66
67 /**
68 * An instance of Soundex using the US_ENGLISH_MAPPING mapping. This treats H and W as silent letters. Apart from when they appear as the first letter, they
69 * are ignored. They don't act as separators between duplicate codes.
70 *
71 * @see #US_ENGLISH_MAPPING_STRING
72 */
73 public static final Soundex US_ENGLISH = new Soundex();
74
75 /**
76 * An instance of Soundex using the Simplified Soundex mapping, as described here: http://west-penwith.org.uk/misc/soundex.htm
77 * <p>
78 * This treats H and W the same as vowels (AEIOUY). Such letters aren't encoded (after the first), but they do act as separators when dropping duplicate
79 * codes. The mapping is otherwise the same as for {@link #US_ENGLISH}.
80 * </p>
81 *
82 * @since 1.11
83 */
84 public static final Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, false);
85
86 /**
87 * An instance of Soundex using the mapping as per the Genealogy site: http://www.genealogy.com/articles/research/00000060.html
88 * <p>
89 * This treats vowels (AEIOUY), H and W as silent letters. Such letters are ignored (after the first) and do not act as separators when dropping duplicate
90 * codes.
91 * </p>
92 * <p>
93 * The codes for consonants are otherwise the same as for {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED}.
94 * </p>
95 *
96 * @since 1.11
97 */
98 public static final Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2");
99
100 /**
101 * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
102 *
103 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
104 */
105 @Deprecated
106 private int maxLength = 4;
107
108 /**
109 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each letter is mapped. This implementation
110 * contains a default map for US_ENGLISH.
111 */
112 private final char[] soundexMapping;
113
114 /**
115 * Should H and W be treated specially?
116 * <p>
117 * In versions of the code prior to 1.11, the code always treated H and W as silent (ignored) letters. If this field is false, H and W are no longer
118 * special-cased.
119 * </p>
120 */
121 private final boolean specialCaseHW;
122
123 /**
124 * Creates an instance using US_ENGLISH_MAPPING.
125 *
126 * @see Soundex#Soundex(char[])
127 * @see Soundex#US_ENGLISH_MAPPING_STRING
128 */
129 public Soundex() {
130 this.soundexMapping = US_ENGLISH_MAPPING;
131 this.specialCaseHW = true;
132 }
133
134 /**
135 * Creates a Soundex instance using the given mapping. This constructor can be used to provide an internationalized mapping for a non-Western character set.
136 * <p>
137 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each letter is mapped. This implementation
138 * contains a default map for US_ENGLISH
139 * </p>
140 * <p>
141 * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment.
142 * </p>
143 *
144 * @param mapping Mapping array to use when finding the corresponding code for a given character.
145 */
146 public Soundex(final char[] mapping) {
147 this.soundexMapping = mapping.clone();
148 this.specialCaseHW = !hasMarker(this.soundexMapping);
149 }
150
151 /**
152 * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping, and/or possibly provide an
153 * internationalized mapping for a non-Western character set.
154 * <p>
155 * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment.
156 * </p>
157 *
158 * @param mapping Mapping string to use when finding the corresponding code for a given character.
159 * @since 1.4
160 */
161 public Soundex(final String mapping) {
162 this.soundexMapping = mapping.toCharArray();
163 this.specialCaseHW = !hasMarker(this.soundexMapping);
164 }
165
166 /**
167 * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping, and/or possibly provide an
168 * internationalized mapping for a non-Western character set.
169 *
170 * @param mapping Mapping string to use when finding the corresponding code for a given character.
171 * @param specialCaseHW if true, then H and W are treated as silent letters that are ignored and do not act as separators between duplicate codes.
172 * @since 1.11
173 */
174 public Soundex(final String mapping, final boolean specialCaseHW) {
175 this.soundexMapping = mapping.toCharArray();
176 this.specialCaseHW = specialCaseHW;
177 }
178
179 /**
180 * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This return value ranges from 0 through 4: 0
181 * indicates little or no similarity, and 4 indicates strong similarity or identical values.
182 *
183 * @param s1 A String that will be encoded and compared.
184 * @param s2 A String that will be encoded and compared.
185 * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
186 * @see SoundexUtils#difference(StringEncoder,String,String)
187 * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS T-SQL DIFFERENCE</a>
188 * @throws EncoderException if an error occurs encoding one of the strings.
189 * @since 1.3
190 */
191 public int difference(final String s1, final String s2) throws EncoderException {
192 return SoundexUtils.difference(this, s1, s2);
193 }
194
195 /**
196 * Encodes an Object using the Soundex algorithm. This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
197 * EncoderException if the supplied object is not of type {@link String}.
198 *
199 * @param obj Object to encode.
200 * @return An object (or type {@link String}) containing the Soundex code which corresponds to the String supplied.
201 * @throws EncoderException if the parameter supplied is not of type {@link String}.
202 * @throws IllegalArgumentException if a character is not mapped.
203 */
204 @Override
205 public Object encode(final Object obj) throws EncoderException {
206 if (!(obj instanceof String)) {
207 throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
208 }
209 return soundex((String) obj);
210 }
211
212 /**
213 * Encodes a String using the Soundex algorithm.
214 *
215 * @param str A String object to encode.
216 * @return A Soundex code corresponding to the String supplied.
217 * @throws IllegalArgumentException if a character is not mapped.
218 */
219 @Override
220 public String encode(final String str) {
221 return soundex(str);
222 }
223
224 /**
225 * Returns the maxLength. Standard Soundex
226 *
227 * @return the maxLength.
228 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
229 */
230 @Deprecated
231 public int getMaxLength() {
232 return this.maxLength;
233 }
234
235 private boolean hasMarker(final char[] mapping) {
236 for (final char ch : mapping) {
237 if (ch == SILENT_MARKER) {
238 return true;
239 }
240 }
241 return false;
242 }
243
244 /**
245 * Maps the given upper-case character to its Soundex code.
246 *
247 * @param ch An upper-case character.
248 * @return A Soundex code.
249 * @throws IllegalArgumentException Thrown if {@code ch} is not mapped.
250 */
251 private char map(final char ch) {
252 final int index = ch - 'A';
253 if (index < 0 || index >= this.soundexMapping.length) {
254 throw new IllegalArgumentException("The character is not mapped: " + ch + " (index=" + index + ")");
255 }
256 return this.soundexMapping[index];
257 }
258
259 /**
260 * Sets the maxLength.
261 *
262 * @param maxLength The maxLength to set.
263 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
264 */
265 @Deprecated
266 public void setMaxLength(final int maxLength) {
267 this.maxLength = maxLength;
268 }
269
270 /**
271 * Retrieves the Soundex code for a given String object.
272 *
273 * @param str String to encode using the Soundex algorithm.
274 * @return A Soundex code for the String supplied.
275 * @throws IllegalArgumentException if a character is not mapped.
276 */
277 public String soundex(String str) {
278 if (str == null) {
279 return null;
280 }
281 str = SoundexUtils.clean(str);
282 if (str.isEmpty()) {
283 return str;
284 }
285 final char[] out = { '0', '0', '0', '0' };
286 int count = 0;
287 final char first = str.charAt(0);
288 out[count++] = first;
289 char lastDigit = map(first); // previous digit
290 for (int i = 1; i < str.length() && count < out.length; i++) {
291 final char ch = str.charAt(i);
292 if (this.specialCaseHW && (ch == 'H' || ch == 'W')) { // these are ignored completely
293 continue;
294 }
295 final char digit = map(ch);
296 if (digit == SILENT_MARKER) {
297 continue;
298 }
299 if (digit != '0' && digit != lastDigit) { // don't store vowels or repeats
300 out[count++] = digit;
301 }
302 lastDigit = digit;
303 }
304 return new String(out);
305 }
306 }