001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.net;
019
020import java.io.ByteArrayOutputStream;
021import java.io.UnsupportedEncodingException;
022import java.util.BitSet;
023
024import org.apache.commons.codec.BinaryDecoder;
025import org.apache.commons.codec.BinaryEncoder;
026import org.apache.commons.codec.CharEncoding;
027import org.apache.commons.codec.DecoderException;
028import org.apache.commons.codec.EncoderException;
029import org.apache.commons.codec.StringDecoder;
030import org.apache.commons.codec.StringEncoder;
031import org.apache.commons.codec.binary.StringUtils;
032
033/**
034 * Implements the 'www-form-urlencoded' encoding scheme, also misleadingly known as URL encoding.
035 * <p>
036 * This codec is meant to be a replacement for standard Java classes {@link java.net.URLEncoder} and
037 * {@link java.net.URLDecoder} on older Java platforms, as these classes in Java versions below
038 * 1.4 rely on the platform's default charset encoding.
039 * </p>
040 * <p>
041 * This class is thread-safe as of 1.11
042 * </p>
043 *
044 * @see <a href="https://www.w3.org/TR/html4/interact/forms.html#h-17.13.4.1">Chapter 17.13.4 Form content types</a>
045 *           of the <a href="https://www.w3.org/TR/html4/">HTML 4.01 Specification</a>
046 *
047 * @since 1.2
048 */
049public class URLCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder {
050
051    /**
052     * Release 1.5 made this field final.
053     */
054    protected static final byte ESCAPE_CHAR = '%';
055
056    /**
057     * BitSet of www-form-url safe characters.
058     * This is a copy of the internal BitSet which is now used for the conversion.
059     * Changes to this field are ignored.
060     *
061     * @deprecated 1.11 Will be removed in 2.0 (CODEC-230)
062     */
063    @Deprecated
064    protected static final BitSet WWW_FORM_URL;
065
066    private static final BitSet WWW_FORM_URL_SAFE = new BitSet(256);
067
068    // Static initializer for www_form_url
069    static {
070        // alpha characters
071        for (int i = 'a'; i <= 'z'; i++) {
072            WWW_FORM_URL_SAFE.set(i);
073        }
074        for (int i = 'A'; i <= 'Z'; i++) {
075            WWW_FORM_URL_SAFE.set(i);
076        }
077        // numeric characters
078        for (int i = '0'; i <= '9'; i++) {
079            WWW_FORM_URL_SAFE.set(i);
080        }
081        // special chars
082        WWW_FORM_URL_SAFE.set('-');
083        WWW_FORM_URL_SAFE.set('_');
084        WWW_FORM_URL_SAFE.set('.');
085        WWW_FORM_URL_SAFE.set('*');
086        // blank to be replaced with +
087        WWW_FORM_URL_SAFE.set(' ');
088
089        // Create a copy in case anyone (ab)uses it
090        WWW_FORM_URL = (BitSet) WWW_FORM_URL_SAFE.clone();
091    }
092
093    /**
094     * Decodes an array of URL safe 7-bit characters into an array of original bytes. Escaped characters are converted
095     * back to their original representation.
096     *
097     * @param bytes
098     *            array of URL safe characters.
099     * @return array of original bytes.
100     * @throws DecoderException
101     *             Thrown if URL decoding is unsuccessful.
102     */
103    public static final byte[] decodeUrl(final byte[] bytes) throws DecoderException {
104        if (bytes == null) {
105            return null;
106        }
107        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
108        for (int i = 0; i < bytes.length; i++) {
109            final int b = bytes[i];
110            if (b == '+') {
111                buffer.write(' ');
112            } else if (b == ESCAPE_CHAR) {
113                try {
114                    final int u = Utils.digit16(bytes[++i]);
115                    final int l = Utils.digit16(bytes[++i]);
116                    buffer.write((char) ((u << 4) + l));
117                } catch (final ArrayIndexOutOfBoundsException e) {
118                    throw new DecoderException("Invalid URL encoding: ", e);
119                }
120            } else {
121                buffer.write(b);
122            }
123        }
124        return buffer.toByteArray();
125    }
126
127    /**
128     * Encodes an array of bytes into an array of URL safe 7-bit characters. Unsafe characters are escaped.
129     *
130     * @param urlsafe
131     *            bitset of characters deemed URL safe.
132     * @param bytes
133     *            array of bytes to convert to URL safe characters.
134     * @return array of bytes containing URL safe characters.
135     */
136    public static final byte[] encodeUrl(BitSet urlsafe, final byte[] bytes) {
137        if (bytes == null) {
138            return null;
139        }
140        if (urlsafe == null) {
141            urlsafe = WWW_FORM_URL_SAFE;
142        }
143
144        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
145        for (final byte c : bytes) {
146            int b = c;
147            if (b < 0) {
148                b = 256 + b;
149            }
150            if (urlsafe.get(b)) {
151                if (b == ' ') {
152                    b = '+';
153                }
154                buffer.write(b);
155            } else {
156                buffer.write(ESCAPE_CHAR);
157                final char hex1 = Utils.hexChar(b >> 4);
158                final char hex2 = Utils.hexChar(b);
159                buffer.write(hex1);
160                buffer.write(hex2);
161            }
162        }
163        return buffer.toByteArray();
164    }
165
166    /**
167     * The default charset used for string decoding and encoding.
168     *
169     * @deprecated TODO: This field will be changed to a private final Charset in 2.0. (CODEC-126)
170     */
171    @Deprecated
172    protected volatile String charset; // added volatile: see CODEC-232
173
174    /**
175     * Default constructor.
176     */
177    public URLCodec() {
178        this(CharEncoding.UTF_8);
179    }
180
181    /**
182     * Constructor which allows for the selection of a default charset.
183     *
184     * @param charset the default string charset to use.
185     */
186    public URLCodec(final String charset) {
187        this.charset = charset;
188    }
189
190    /**
191     * Decodes an array of URL safe 7-bit characters into an array of original bytes. Escaped characters are converted
192     * back to their original representation.
193     *
194     * @param bytes
195     *            array of URL safe characters.
196     * @return array of original bytes.
197     * @throws DecoderException
198     *             Thrown if URL decoding is unsuccessful.
199     */
200    @Override
201    public byte[] decode(final byte[] bytes) throws DecoderException {
202        return decodeUrl(bytes);
203    }
204
205    /**
206     * Decodes a URL safe object into its original form. Escaped characters are converted back to their original
207     * representation.
208     *
209     * @param obj
210     *            URL safe object to convert into its original form.
211     * @return original object.
212     * @throws DecoderException
213     *             Thrown if the argument is not a {@code String} or {@code byte[]}. Thrown if a failure
214     *             condition is encountered during the decode process.
215     */
216    @Override
217    public Object decode(final Object obj) throws DecoderException {
218        if (obj == null) {
219            return null;
220        }
221        if (obj instanceof byte[]) {
222            return decode((byte[]) obj);
223        }
224        if (obj instanceof String) {
225            return decode((String) obj);
226        }
227        throw new DecoderException("Objects of type " + obj.getClass().getName() + " cannot be URL decoded");
228    }
229
230    /**
231     * Decodes a URL safe string into its original form using the default string charset. Escaped characters are
232     * converted back to their original representation.
233     *
234     * @param str
235     *            URL safe string to convert into its original form.
236     * @return original string.
237     * @throws DecoderException
238     *             Thrown if URL decoding is unsuccessful.
239     * @see #getDefaultCharset()
240     */
241    @Override
242    public String decode(final String str) throws DecoderException {
243        if (str == null) {
244            return null;
245        }
246        try {
247            return decode(str, getDefaultCharset());
248        } catch (final UnsupportedEncodingException e) {
249            throw new DecoderException(e.getMessage(), e);
250        }
251    }
252
253    /**
254     * Decodes a URL safe string into its original form using the specified encoding. Escaped characters are converted
255     * back to their original representation.
256     *
257     * @param str
258     *            URL safe string to convert into its original form.
259     * @param charsetName
260     *            the original string charset.
261     * @return original string.
262     * @throws DecoderException
263     *             Thrown if URL decoding is unsuccessful.
264     * @throws UnsupportedEncodingException
265     *             Thrown if charset is not supported.
266     */
267    public String decode(final String str, final String charsetName)
268            throws DecoderException, UnsupportedEncodingException {
269        if (str == null) {
270            return null;
271        }
272        return new String(decode(StringUtils.getBytesUsAscii(str)), charsetName);
273    }
274
275    /**
276     * Encodes an array of bytes into an array of URL safe 7-bit characters. Unsafe characters are escaped.
277     *
278     * @param bytes
279     *            array of bytes to convert to URL safe characters.
280     * @return array of bytes containing URL safe characters.
281     */
282    @Override
283    public byte[] encode(final byte[] bytes) {
284        return encodeUrl(WWW_FORM_URL_SAFE, bytes);
285    }
286
287    /**
288     * Encodes an object into its URL safe form. Unsafe characters are escaped.
289     *
290     * @param obj
291     *            string to convert to a URL safe form.
292     * @return URL safe object.
293     * @throws EncoderException
294     *             Thrown if URL encoding is not applicable to objects of this type or if encoding is unsuccessful.
295     */
296    @Override
297    public Object encode(final Object obj) throws EncoderException {
298        if (obj == null) {
299            return null;
300        }
301        if (obj instanceof byte[]) {
302            return encode((byte[]) obj);
303        }
304        if (obj instanceof String) {
305            return encode((String) obj);
306        }
307        throw new EncoderException("Objects of type " + obj.getClass().getName() + " cannot be URL encoded");
308    }
309
310    /**
311     * Encodes a string into its URL safe form using the default string charset. Unsafe characters are escaped.
312     *
313     * @param str
314     *            string to convert to a URL safe form.
315     * @return URL safe string.
316     * @throws EncoderException
317     *             Thrown if URL encoding is unsuccessful.
318     * @see #getDefaultCharset()
319     */
320    @Override
321    public String encode(final String str) throws EncoderException {
322        if (str == null) {
323            return null;
324        }
325        try {
326            return encode(str, getDefaultCharset());
327        } catch (final UnsupportedEncodingException e) {
328            throw new EncoderException(e.getMessage(), e);
329        }
330    }
331
332    /**
333     * Encodes a string into its URL safe form using the specified string charset. Unsafe characters are escaped.
334     *
335     * @param str
336     *            string to convert to a URL safe form.
337     * @param charsetName
338     *            the charset for str.
339     * @return URL safe string.
340     * @throws UnsupportedEncodingException
341     *             Thrown if charset is not supported.
342     */
343    public String encode(final String str, final String charsetName) throws UnsupportedEncodingException {
344        if (str == null) {
345            return null;
346        }
347        return StringUtils.newStringUsAscii(encode(str.getBytes(charsetName)));
348    }
349
350    /**
351     * The default charset used for string decoding and encoding.
352     *
353     * @return the default string charset.
354     */
355    public String getDefaultCharset() {
356        return this.charset;
357    }
358
359    /**
360     * The {@code String} encoding used for decoding and encoding.
361     *
362     * @return the encoding.
363     * @deprecated Use {@link #getDefaultCharset()}, will be removed in 2.0.
364     */
365    @Deprecated
366    public String getEncoding() {
367        return this.charset;
368    }
369
370}