001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language;
019
020import org.apache.commons.codec.EncoderException;
021import org.apache.commons.codec.StringEncoder;
022
023/**
024 * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a general purpose scheme to find word
025 * with similar phonemes.
026 *
027 * <p>
028 * This class is thread-safe. Although not strictly immutable, the mutable fields are not actually used.
029 * </p>
030 */
031public class Soundex implements StringEncoder {
032
033    /**
034     * The marker character used to indicate a silent (ignored) character. These are ignored except when they appear as the first character.
035     * <p>
036     * Note: The {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism because changing it might break existing code. Mappings that don't contain a
037     * silent marker code are treated as though H and W are silent.
038     * </p>
039     * <p>
040     * To override this, use the {@link #Soundex(String, boolean)} constructor.
041     * </p>
042     *
043     * @since 1.11
044     */
045    public static final char SILENT_MARKER = '-';
046
047    /**
048     * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position means do not encode, but treat as a separator
049     * when it occurs between consonants with the same code.
050     * <p>
051     * (This constant is provided as both an implementation convenience and to allow Javadoc to pick up the value for the constant values page.)
052     * </p>
053     * <p>
054     * <strong>Note that letters H and W are treated specially.</strong> They are ignored (after the first letter) and don't act as separators between
055     * consonants with the same code.
056     * </p>
057     */
058    public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
059
060    /**
061     * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position means do not encode.
062     *
063     * @see Soundex#Soundex(char[])
064     */
065    private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
066
067    /**
068     * An instance of Soundex using the US_ENGLISH_MAPPING mapping. This treats H and W as silent letters. Apart from when they appear as the first letter, they
069     * are ignored. They don't act as separators between duplicate codes.
070     *
071     * @see #US_ENGLISH_MAPPING_STRING
072     */
073    public static final Soundex US_ENGLISH = new Soundex();
074
075    /**
076     * An instance of Soundex using the Simplified Soundex mapping, as described here: http://west-penwith.org.uk/misc/soundex.htm
077     * <p>
078     * This treats H and W the same as vowels (AEIOUY). Such letters aren't encoded (after the first), but they do act as separators when dropping duplicate
079     * codes. The mapping is otherwise the same as for {@link #US_ENGLISH}.
080     * </p>
081     *
082     * @since 1.11
083     */
084    public static final Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, false);
085
086    /**
087     * An instance of Soundex using the mapping as per the Genealogy site: http://www.genealogy.com/articles/research/00000060.html
088     * <p>
089     * This treats vowels (AEIOUY), H and W as silent letters. Such letters are ignored (after the first) and do not act as separators when dropping duplicate
090     * codes.
091     * </p>
092     * <p>
093     * The codes for consonants are otherwise the same as for {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED}.
094     * </p>
095     *
096     * @since 1.11
097     */
098    public static final Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2");
099
100    /**
101     * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
102     *
103     * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
104     */
105    @Deprecated
106    private int maxLength = 4;
107
108    /**
109     * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each letter is mapped. This implementation
110     * contains a default map for US_ENGLISH.
111     */
112    private final char[] soundexMapping;
113
114    /**
115     * Should H and W be treated specially?
116     * <p>
117     * In versions of the code prior to 1.11, the code always treated H and W as silent (ignored) letters. If this field is false, H and W are no longer
118     * special-cased.
119     * </p>
120     */
121    private final boolean specialCaseHW;
122
123    /**
124     * Creates an instance using US_ENGLISH_MAPPING.
125     *
126     * @see Soundex#Soundex(char[])
127     * @see Soundex#US_ENGLISH_MAPPING_STRING
128     */
129    public Soundex() {
130        this.soundexMapping = US_ENGLISH_MAPPING;
131        this.specialCaseHW = true;
132    }
133
134    /**
135     * Creates a Soundex instance using the given mapping. This constructor can be used to provide an internationalized mapping for a non-Western character set.
136     * <p>
137     * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each letter is mapped. This implementation
138     * contains a default map for US_ENGLISH
139     * </p>
140     * <p>
141     * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment.
142     * </p>
143     *
144     * @param mapping Mapping array to use when finding the corresponding code for a given character.
145     */
146    public Soundex(final char[] mapping) {
147        this.soundexMapping = mapping.clone();
148        this.specialCaseHW = !hasMarker(this.soundexMapping);
149    }
150
151    /**
152     * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping, and/or possibly provide an
153     * internationalized mapping for a non-Western character set.
154     * <p>
155     * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment.
156     * </p>
157     *
158     * @param mapping Mapping string to use when finding the corresponding code for a given character.
159     * @since 1.4
160     */
161    public Soundex(final String mapping) {
162        this.soundexMapping = mapping.toCharArray();
163        this.specialCaseHW = !hasMarker(this.soundexMapping);
164    }
165
166    /**
167     * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping, and/or possibly provide an
168     * internationalized mapping for a non-Western character set.
169     *
170     * @param mapping       Mapping string to use when finding the corresponding code for a given character.
171     * @param specialCaseHW if true, then H and W are treated as silent letters that are ignored and do not act as separators between duplicate codes.
172     * @since 1.11
173     */
174    public Soundex(final String mapping, final boolean specialCaseHW) {
175        this.soundexMapping = mapping.toCharArray();
176        this.specialCaseHW = specialCaseHW;
177    }
178
179    /**
180     * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This return value ranges from 0 through 4: 0
181     * indicates little or no similarity, and 4 indicates strong similarity or identical values.
182     *
183     * @param s1 A String that will be encoded and compared.
184     * @param s2 A String that will be encoded and compared.
185     * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
186     * @see SoundexUtils#difference(StringEncoder,String,String)
187     * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS T-SQL DIFFERENCE</a>
188     * @throws EncoderException if an error occurs encoding one of the strings.
189     * @since 1.3
190     */
191    public int difference(final String s1, final String s2) throws EncoderException {
192        return SoundexUtils.difference(this, s1, s2);
193    }
194
195    /**
196     * Encodes an Object using the Soundex algorithm. This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
197     * EncoderException if the supplied object is not of type {@link String}.
198     *
199     * @param obj Object to encode.
200     * @return An object (or type {@link String}) containing the Soundex code which corresponds to the String supplied.
201     * @throws EncoderException         if the parameter supplied is not of type {@link String}.
202     * @throws IllegalArgumentException if a character is not mapped.
203     */
204    @Override
205    public Object encode(final Object obj) throws EncoderException {
206        if (!(obj instanceof String)) {
207            throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
208        }
209        return soundex((String) obj);
210    }
211
212    /**
213     * Encodes a String using the Soundex algorithm.
214     *
215     * @param str A String object to encode.
216     * @return A Soundex code corresponding to the String supplied.
217     * @throws IllegalArgumentException if a character is not mapped.
218     */
219    @Override
220    public String encode(final String str) {
221        return soundex(str);
222    }
223
224    /**
225     * Returns the maxLength. Standard Soundex
226     *
227     * @return the maxLength.
228     * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
229     */
230    @Deprecated
231    public int getMaxLength() {
232        return this.maxLength;
233    }
234
235    private boolean hasMarker(final char[] mapping) {
236        for (final char ch : mapping) {
237            if (ch == SILENT_MARKER) {
238                return true;
239            }
240        }
241        return false;
242    }
243
244    /**
245     * Maps the given upper-case character to its Soundex code.
246     *
247     * @param ch An upper-case character.
248     * @return A Soundex code.
249     * @throws IllegalArgumentException Thrown if {@code ch} is not mapped.
250     */
251    private char map(final char ch) {
252        final int index = ch - 'A';
253        if (index < 0 || index >= this.soundexMapping.length) {
254            throw new IllegalArgumentException("The character is not mapped: " + ch + " (index=" + index + ")");
255        }
256        return this.soundexMapping[index];
257    }
258
259    /**
260     * Sets the maxLength.
261     *
262     * @param maxLength The maxLength to set.
263     * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
264     */
265    @Deprecated
266    public void setMaxLength(final int maxLength) {
267        this.maxLength = maxLength;
268    }
269
270    /**
271     * Retrieves the Soundex code for a given String object.
272     *
273     * @param str String to encode using the Soundex algorithm.
274     * @return A Soundex code for the String supplied.
275     * @throws IllegalArgumentException if a character is not mapped.
276     */
277    public String soundex(String str) {
278        if (str == null) {
279            return null;
280        }
281        str = SoundexUtils.clean(str);
282        if (str.isEmpty()) {
283            return str;
284        }
285        final char[] out = { '0', '0', '0', '0' };
286        int count = 0;
287        final char first = str.charAt(0);
288        out[count++] = first;
289        char lastDigit = map(first); // previous digit
290        for (int i = 1; i < str.length() && count < out.length; i++) {
291            final char ch = str.charAt(i);
292            if (this.specialCaseHW && (ch == 'H' || ch == 'W')) { // these are ignored completely
293                continue;
294            }
295            final char digit = map(ch);
296            if (digit == SILENT_MARKER) {
297                continue;
298            }
299            if (digit != '0' && digit != lastDigit) { // don't store vowels or repeats
300                out[count++] = digit;
301            }
302            lastDigit = digit;
303        }
304        return new String(out);
305    }
306}