View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language;
19  
20  import org.apache.commons.codec.EncoderException;
21  import org.apache.commons.codec.StringEncoder;
22  
23  /**
24   * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a general purpose scheme to find word
25   * with similar phonemes.
26   *
27   * <p>
28   * This class is thread-safe. Although not strictly immutable, the mutable fields are not actually used.
29   * </p>
30   */
31  public class Soundex implements StringEncoder {
32  
33      /**
34       * The marker character used to indicate a silent (ignored) character. These are ignored except when they appear as the first character.
35       * <p>
36       * Note: The {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism because changing it might break existing code. Mappings that don't contain a
37       * silent marker code are treated as though H and W are silent.
38       * </p>
39       * <p>
40       * To override this, use the {@link #Soundex(String, boolean)} constructor.
41       * </p>
42       *
43       * @since 1.11
44       */
45      public static final char SILENT_MARKER = '-';
46  
47      /**
48       * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position means do not encode, but treat as a separator
49       * when it occurs between consonants with the same code.
50       * <p>
51       * (This constant is provided as both an implementation convenience and to allow Javadoc to pick up the value for the constant values page.)
52       * </p>
53       * <p>
54       * <strong>Note that letters H and W are treated specially.</strong> They are ignored (after the first letter) and don't act as separators between
55       * consonants with the same code.
56       * </p>
57       */
58      public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
59  
60      /**
61       * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position means do not encode.
62       *
63       * @see Soundex#Soundex(char[])
64       */
65      private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
66  
67      /**
68       * An instance of Soundex using the US_ENGLISH_MAPPING mapping. This treats H and W as silent letters. Apart from when they appear as the first letter, they
69       * are ignored. They don't act as separators between duplicate codes.
70       *
71       * @see #US_ENGLISH_MAPPING_STRING
72       */
73      public static final Soundex US_ENGLISH = new Soundex();
74  
75      /**
76       * An instance of Soundex using the Simplified Soundex mapping, as described here: http://west-penwith.org.uk/misc/soundex.htm
77       * <p>
78       * This treats H and W the same as vowels (AEIOUY). Such letters aren't encoded (after the first), but they do act as separators when dropping duplicate
79       * codes. The mapping is otherwise the same as for {@link #US_ENGLISH}.
80       * </p>
81       *
82       * @since 1.11
83       */
84      public static final Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, false);
85  
86      /**
87       * An instance of Soundex using the mapping as per the Genealogy site: http://www.genealogy.com/articles/research/00000060.html
88       * <p>
89       * This treats vowels (AEIOUY), H and W as silent letters. Such letters are ignored (after the first) and do not act as separators when dropping duplicate
90       * codes.
91       * </p>
92       * <p>
93       * The codes for consonants are otherwise the same as for {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED}.
94       * </p>
95       *
96       * @since 1.11
97       */
98      public static final Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2");
99  
100     /**
101      * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
102      *
103      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
104      */
105     @Deprecated
106     private int maxLength = 4;
107 
108     /**
109      * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each letter is mapped. This implementation
110      * contains a default map for US_ENGLISH.
111      */
112     private final char[] soundexMapping;
113 
114     /**
115      * Should H and W be treated specially?
116      * <p>
117      * In versions of the code prior to 1.11, the code always treated H and W as silent (ignored) letters. If this field is false, H and W are no longer
118      * special-cased.
119      * </p>
120      */
121     private final boolean specialCaseHW;
122 
123     /**
124      * Creates an instance using US_ENGLISH_MAPPING.
125      *
126      * @see Soundex#Soundex(char[])
127      * @see Soundex#US_ENGLISH_MAPPING_STRING
128      */
129     public Soundex() {
130         this.soundexMapping = US_ENGLISH_MAPPING;
131         this.specialCaseHW = true;
132     }
133 
134     /**
135      * Creates a Soundex instance using the given mapping. This constructor can be used to provide an internationalized mapping for a non-Western character set.
136      * <p>
137      * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each letter is mapped. This implementation
138      * contains a default map for US_ENGLISH
139      * </p>
140      * <p>
141      * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment.
142      * </p>
143      *
144      * @param mapping Mapping array to use when finding the corresponding code for a given character.
145      */
146     public Soundex(final char[] mapping) {
147         this.soundexMapping = mapping.clone();
148         this.specialCaseHW = !hasMarker(this.soundexMapping);
149     }
150 
151     /**
152      * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping, and/or possibly provide an
153      * internationalized mapping for a non-Western character set.
154      * <p>
155      * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment.
156      * </p>
157      *
158      * @param mapping Mapping string to use when finding the corresponding code for a given character.
159      * @since 1.4
160      */
161     public Soundex(final String mapping) {
162         this.soundexMapping = mapping.toCharArray();
163         this.specialCaseHW = !hasMarker(this.soundexMapping);
164     }
165 
166     /**
167      * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping, and/or possibly provide an
168      * internationalized mapping for a non-Western character set.
169      *
170      * @param mapping       Mapping string to use when finding the corresponding code for a given character.
171      * @param specialCaseHW if true, then H and W are treated as silent letters that are ignored and do not act as separators between duplicate codes.
172      * @since 1.11
173      */
174     public Soundex(final String mapping, final boolean specialCaseHW) {
175         this.soundexMapping = mapping.toCharArray();
176         this.specialCaseHW = specialCaseHW;
177     }
178 
179     /**
180      * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This return value ranges from 0 through 4: 0
181      * indicates little or no similarity, and 4 indicates strong similarity or identical values.
182      *
183      * @param s1 A String that will be encoded and compared.
184      * @param s2 A String that will be encoded and compared.
185      * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
186      * @see SoundexUtils#difference(StringEncoder,String,String)
187      * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS T-SQL DIFFERENCE</a>
188      * @throws EncoderException if an error occurs encoding one of the strings.
189      * @since 1.3
190      */
191     public int difference(final String s1, final String s2) throws EncoderException {
192         return SoundexUtils.difference(this, s1, s2);
193     }
194 
195     /**
196      * Encodes an Object using the Soundex algorithm. This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
197      * EncoderException if the supplied object is not of type {@link String}.
198      *
199      * @param obj Object to encode.
200      * @return An object (or type {@link String}) containing the Soundex code which corresponds to the String supplied.
201      * @throws EncoderException         if the parameter supplied is not of type {@link String}.
202      * @throws IllegalArgumentException if a character is not mapped.
203      */
204     @Override
205     public Object encode(final Object obj) throws EncoderException {
206         if (!(obj instanceof String)) {
207             throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
208         }
209         return soundex((String) obj);
210     }
211 
212     /**
213      * Encodes a String using the Soundex algorithm.
214      *
215      * @param str A String object to encode.
216      * @return A Soundex code corresponding to the String supplied.
217      * @throws IllegalArgumentException if a character is not mapped.
218      */
219     @Override
220     public String encode(final String str) {
221         return soundex(str);
222     }
223 
224     /**
225      * Returns the maxLength. Standard Soundex
226      *
227      * @return the maxLength.
228      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
229      */
230     @Deprecated
231     public int getMaxLength() {
232         return this.maxLength;
233     }
234 
235     private boolean hasMarker(final char[] mapping) {
236         for (final char ch : mapping) {
237             if (ch == SILENT_MARKER) {
238                 return true;
239             }
240         }
241         return false;
242     }
243 
244     /**
245      * Maps the given upper-case character to its Soundex code.
246      *
247      * @param ch An upper-case character.
248      * @return A Soundex code.
249      * @throws IllegalArgumentException Thrown if {@code ch} is not mapped.
250      */
251     private char map(final char ch) {
252         final int index = ch - 'A';
253         if (index < 0 || index >= this.soundexMapping.length) {
254             throw new IllegalArgumentException("The character is not mapped: " + ch + " (index=" + index + ")");
255         }
256         return this.soundexMapping[index];
257     }
258 
259     /**
260      * Sets the maxLength.
261      *
262      * @param maxLength The maxLength to set.
263      * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
264      */
265     @Deprecated
266     public void setMaxLength(final int maxLength) {
267         this.maxLength = maxLength;
268     }
269 
270     /**
271      * Retrieves the Soundex code for a given String object.
272      *
273      * @param str String to encode using the Soundex algorithm.
274      * @return A Soundex code for the String supplied.
275      * @throws IllegalArgumentException if a character is not mapped.
276      */
277     public String soundex(String str) {
278         if (str == null) {
279             return null;
280         }
281         str = SoundexUtils.clean(str);
282         if (str.isEmpty()) {
283             return str;
284         }
285         final char[] out = { '0', '0', '0', '0' };
286         int count = 0;
287         final char first = str.charAt(0);
288         out[count++] = first;
289         char lastDigit = map(first); // previous digit
290         for (int i = 1; i < str.length() && count < out.length; i++) {
291             final char ch = str.charAt(i);
292             if (this.specialCaseHW && (ch == 'H' || ch == 'W')) { // these are ignored completely
293                 continue;
294             }
295             final char digit = map(ch);
296             if (digit == SILENT_MARKER) {
297                 continue;
298             }
299             if (digit != '0' && digit != lastDigit) { // don't store vowels or repeats
300                 out[count++] = digit;
301             }
302             lastDigit = digit;
303         }
304         return new String(out);
305     }
306 }