View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.net;
19  
20  import java.io.ByteArrayOutputStream;
21  import java.io.UnsupportedEncodingException;
22  import java.nio.charset.Charset;
23  import java.nio.charset.IllegalCharsetNameException;
24  import java.nio.charset.StandardCharsets;
25  import java.nio.charset.UnsupportedCharsetException;
26  import java.util.BitSet;
27  
28  import org.apache.commons.codec.BinaryDecoder;
29  import org.apache.commons.codec.BinaryEncoder;
30  import org.apache.commons.codec.DecoderException;
31  import org.apache.commons.codec.EncoderException;
32  import org.apache.commons.codec.StringDecoder;
33  import org.apache.commons.codec.StringEncoder;
34  import org.apache.commons.codec.binary.StringUtils;
35  
36  /**
37   * Codec for the Quoted-Printable section of <a href="https://www.ietf.org/rfc/rfc1521.txt">RFC 1521</a>.
38   * <p>
39   * The Quoted-Printable encoding is intended to represent data that largely consists of octets that correspond to printable characters in the ASCII character
40   * set. It encodes the data in such a way that the resulting octets are unlikely to be modified by mail transport. If the data being encoded are mostly ASCII
41   * text, the encoded form of the data remains largely recognizable by humans. A body which is entirely ASCII may also be encoded in Quoted-Printable to ensure
42   * the integrity of the data should the message pass through a character-translating, and/or line-wrapping gateway.
43   * </p>
44   * <p>
45   * Note:
46   * </p>
47   * <p>
48   * Depending on the selected {@code strict} parameter, this class will implement a different set of rules of the quoted-printable spec:
49   * </p>
50   * <ul>
51   * <li>{@code strict=false}: only rules #1 and #2 are implemented</li>
52   * <li>{@code strict=true}: all rules #1 through #5 are implemented</li>
53   * </ul>
54   * <p>
55   * Originally, this class only supported the non-strict mode, but the codec in this partial form could already be used for certain applications that do not
56   * require quoted-printable line formatting (rules #3, #4, #5), for instance Q codec. The strict mode has been added in 1.10.
57   * </p>
58   * <p>
59   * This class is immutable and thread-safe.
60   * </p>
61   *
62   * @see <a href="https://www.ietf.org/rfc/rfc1521.txt">RFC 1521 MIME (Multipurpose Internet Mail Extensions) Part One: Mechanisms for Specifying and Describing
63   *      the Format of Internet Message Bodies </a>
64   *
65   * @since 1.3
66   */
67  public class QuotedPrintableCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder {
68  
69      /**
70       * BitSet of printable characters as defined in RFC 1521.
71       */
72      private static final BitSet PRINTABLE_CHARS = new BitSet(256);
73      private static final byte ESCAPE_CHAR = '=';
74      private static final byte TAB = 9;
75      private static final byte SPACE = 32;
76      private static final byte CR = 13;
77      private static final byte LF = 10;
78  
79      /**
80       * Minimum length required for the byte arrays used by encodeQuotedPrintable method.
81       */
82      private static final int MIN_BYTES = 3;
83  
84      /**
85       * Safe line length for quoted printable encoded text.
86       */
87      private static final int SAFE_LENGTH = 73;
88  
89      // Static initializer for printable chars collection
90      static {
91          // alpha characters
92          for (int i = 33; i <= 60; i++) {
93              PRINTABLE_CHARS.set(i);
94          }
95          for (int i = 62; i <= 126; i++) {
96              PRINTABLE_CHARS.set(i);
97          }
98          PRINTABLE_CHARS.set(TAB);
99          PRINTABLE_CHARS.set(SPACE);
100     }
101 
102     /**
103      * Decodes an array quoted-printable characters into an array of original bytes. Escaped characters are converted back to their original representation.
104      * <p>
105      * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as defined in RFC 1521.
106      * </p>
107      *
108      * @param bytes array of quoted-printable characters.
109      * @return array of original bytes.
110      * @throws DecoderException Thrown if quoted-printable decoding is unsuccessful.
111      */
112     public static final byte[] decodeQuotedPrintable(final byte[] bytes) throws DecoderException {
113         if (bytes == null) {
114             return null;
115         }
116         final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
117         for (int i = 0; i < bytes.length; i++) {
118             final int b = bytes[i];
119             if (b == ESCAPE_CHAR) {
120                 try {
121                     // if the next octet is a CR we have found a soft line break
122                     if (bytes[++i] == CR) {
123                         continue;
124                     }
125                     final int u = Utils.digit16(bytes[i]);
126                     final int l = Utils.digit16(bytes[++i]);
127                     buffer.write((char) ((u << 4) + l));
128                 } catch (final ArrayIndexOutOfBoundsException e) {
129                     throw new DecoderException("Invalid quoted-printable encoding", e);
130                 }
131             } else if (b != CR && b != LF) {
132                 // every other octet is appended except for CR & LF
133                 buffer.write(b);
134             }
135         }
136         return buffer.toByteArray();
137     }
138 
139     /**
140      * Encodes a byte in the buffer.
141      *
142      * @param b      byte to write.
143      * @param encode indicates whether the octet shall be encoded.
144      * @param buffer the buffer to write to.
145      * @return the number of bytes that have been written to the buffer.
146      */
147     private static int encodeByte(final int b, final boolean encode, final ByteArrayOutputStream buffer) {
148         if (encode) {
149             return encodeQuotedPrintable(b, buffer);
150         }
151         buffer.write(b);
152         return 1;
153     }
154 
    /**
     * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
     * <p>
     * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in RFC 1521 and is suitable for encoding
     * binary data and unformatted text.
     * </p>
     * <p>
     * Equivalent to calling {@link #encodeQuotedPrintable(BitSet, byte[], boolean)} with {@code strict} set to {@code false}.
     * </p>
     *
     * @param printable bitset of characters deemed quoted-printable.
     * @param bytes     array of bytes to be encoded.
     * @return array of bytes containing quoted-printable data.
     */
    public static final byte[] encodeQuotedPrintable(final BitSet printable, final byte[] bytes) {
        // non-strict: no soft line breaks, only escaping
        return encodeQuotedPrintable(printable, bytes, false);
    }
169 
170     /**
171      * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
172      * <p>
173      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset or only a subset of quoted-printable
174      * encoding specification (rule #1 and rule #2) as defined in RFC 1521 and is suitable for encoding binary data and unformatted text.
175      * </p>
176      *
177      * @param printable bitset of characters deemed quoted-printable.
178      * @param bytes     array of bytes to be encoded.
179      * @param strict    if {@code true} the full ruleset is used, otherwise only rule #1 and rule #2.
180      * @return array of bytes containing quoted-printable data.
181      * @since 1.10
182      */
183     public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes, final boolean strict) {
184         if (bytes == null) {
185             return null;
186         }
187         if (printable == null) {
188             printable = PRINTABLE_CHARS;
189         }
190         final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
191         final int bytesLength = bytes.length;
192         if (strict) {
193             if (bytesLength < MIN_BYTES) {
194                 return null;
195             }
196             int pos = 1;
197             // encode up to buffer.length - 3, the last three octets will be treated
198             // separately for simplification of note #3
199             for (int i = 0; i < bytesLength - 3; i++) {
200                 final int b = getUnsignedOctet(i, bytes);
201                 if (pos < SAFE_LENGTH) {
202                     // up to this length it is safe to add any byte, encoded or not
203                     pos += encodeByte(b, !printable.get(b), buffer);
204                 } else {
205                     // rule #3: whitespace at the end of a line *must* be encoded
206                     encodeByte(b, !printable.get(b) || isWhitespace(b), buffer);
207                     // rule #5: soft line break
208                     buffer.write(ESCAPE_CHAR);
209                     buffer.write(CR);
210                     buffer.write(LF);
211                     pos = 1;
212                 }
213             }
214             // rule #3: whitespace at the end of a line *must* be encoded
215             // if we would do a soft break line after this octet, encode whitespace
216             int b = getUnsignedOctet(bytesLength - 3, bytes);
217             boolean encode = !printable.get(b) || isWhitespace(b) && pos > SAFE_LENGTH - 5;
218             pos += encodeByte(b, encode, buffer);
219             // note #3: '=' *must not* be the ultimate or penultimate character
220             // simplification: if < 6 bytes left, do a soft line break as we may need
221             // exactly 6 bytes space for the last 2 bytes
222             if (pos > SAFE_LENGTH - 2) {
223                 buffer.write(ESCAPE_CHAR);
224                 buffer.write(CR);
225                 buffer.write(LF);
226             }
227             for (int i = bytesLength - 2; i < bytesLength; i++) {
228                 b = getUnsignedOctet(i, bytes);
229                 // rule #3: trailing whitespace shall be encoded
230                 encode = !printable.get(b) || i > bytesLength - 2 && isWhitespace(b);
231                 encodeByte(b, encode, buffer);
232             }
233         } else {
234             for (final byte c : bytes) {
235                 int b = c;
236                 if (b < 0) {
237                     b = 256 + b;
238                 }
239                 if (printable.get(b)) {
240                     buffer.write(b);
241                 } else {
242                     encodeQuotedPrintable(b, buffer);
243                 }
244             }
245         }
246         return buffer.toByteArray();
247     }
248 
249     /**
250      * Encodes byte into its quoted-printable representation.
251      *
252      * @param b      byte to encode.
253      * @param buffer the buffer to write to.
254      * @return The number of bytes written to the {@code buffer}.
255      */
256     private static int encodeQuotedPrintable(final int b, final ByteArrayOutputStream buffer) {
257         buffer.write(ESCAPE_CHAR);
258         final char hex1 = Utils.hexChar(b >> 4);
259         final char hex2 = Utils.hexChar(b);
260         buffer.write(hex1);
261         buffer.write(hex2);
262         return 3;
263     }
264 
265     /**
266      * Gets the byte at position {@code index} of the byte array and make sure it is unsigned.
267      *
268      * @param index position in the array.
269      * @param bytes the byte array.
270      * @return the unsigned octet at position {@code index} from the array.
271      */
272     private static int getUnsignedOctet(final int index, final byte[] bytes) {
273         int b = bytes[index];
274         if (b < 0) {
275             b = 256 + b;
276         }
277         return b;
278     }
279 
280     /**
281      * Checks whether the given byte is whitespace.
282      *
283      * @param b byte to be checked.
284      * @return {@code true} if the byte is either a space or tab character.
285      */
286     private static boolean isWhitespace(final int b) {
287         return b == SPACE || b == TAB;
288     }
289 
    /**
     * The default Charset used for string decoding and encoding; set once by the constructor.
     */
    private final Charset charset;

    /**
     * Indicates whether soft line breaks shall be used during encoding (rule #3-5). Passed as the {@code strict} argument when encoding byte arrays.
     */
    private final boolean strict;
299 
    /**
     * Default constructor, assumes default Charset of {@link StandardCharsets#UTF_8} and non-strict mode.
     */
    public QuotedPrintableCodec() {
        this(StandardCharsets.UTF_8, false);
    }
306 
    /**
     * Constructor which allows for the selection of the strict mode. The default Charset is {@link StandardCharsets#UTF_8}.
     *
     * @param strict if {@code true}, soft line breaks will be used.
     * @since 1.10
     */
    public QuotedPrintableCodec(final boolean strict) {
        this(StandardCharsets.UTF_8, strict);
    }
316 
    /**
     * Constructor which allows for the selection of a default Charset. Strict mode is disabled.
     *
     * @param charset the default string Charset to use.
     * @since 1.7
     */
    public QuotedPrintableCodec(final Charset charset) {
        this(charset, false);
    }
326 
327     /**
328      * Constructor which allows for the selection of a default Charset and strict mode.
329      *
330      * @param charset the default string Charset to use.
331      * @param strict  if {@code true}, soft line breaks will be used.
332      * @since 1.10
333      */
334     public QuotedPrintableCodec(final Charset charset, final boolean strict) {
335         this.charset = charset;
336         this.strict = strict;
337     }
338 
    /**
     * Constructor which allows for the selection of a default Charset by name. The name is resolved with {@link Charset#forName(String)}; strict mode is
     * disabled.
     *
     * @param charsetName the name of the default string Charset to use.
     * @throws UnsupportedCharsetException If no support for the named Charset is available in this instance of the Java virtual machine.
     * @throws IllegalArgumentException    If the given charsetName is null.
     * @throws IllegalCharsetNameException If the given Charset name is illegal.
     *
     * @since 1.7 throws UnsupportedCharsetException if the named Charset is unavailable
     */
    public QuotedPrintableCodec(final String charsetName) throws IllegalCharsetNameException, IllegalArgumentException, UnsupportedCharsetException {
        this(Charset.forName(charsetName), false);
    }
352 
    /**
     * Decodes an array of quoted-printable characters into an array of original bytes. Escaped characters are converted back to their original representation.
     * <p>
     * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as defined in RFC 1521.
     * </p>
     * <p>
     * Delegates to {@link #decodeQuotedPrintable(byte[])}.
     * </p>
     *
     * @param bytes array of quoted-printable characters.
     * @return array of original bytes.
     * @throws DecoderException Thrown if quoted-printable decoding is unsuccessful.
     */
    @Override
    public byte[] decode(final byte[] bytes) throws DecoderException {
        return decodeQuotedPrintable(bytes);
    }
367 
368     /**
369      * Decodes a quoted-printable object into its original form. Escaped characters are converted back to their original representation.
370      *
371      * @param obj quoted-printable object to convert into its original form.
372      * @return original object.
373      * @throws DecoderException Thrown if the argument is not a {@code String} or {@code byte[]}. Thrown if a failure condition is encountered during the decode
374      *                          process.
375      */
376     @Override
377     public Object decode(final Object obj) throws DecoderException {
378         if (obj == null) {
379             return null;
380         }
381         if (obj instanceof byte[]) {
382             return decode((byte[]) obj);
383         }
384         if (obj instanceof String) {
385             return decode((String) obj);
386         }
387         throw new DecoderException("Objects of type " + obj.getClass().getName() + " cannot be quoted-printable decoded");
388     }
389 
390     /**
391      * Decodes a quoted-printable string into its original form using the default string Charset. Escaped characters are converted back to their original
392      * representation.
393      *
394      * @param sourceStr quoted-printable string to convert into its original form.
395      * @return original string.
396      * @throws DecoderException Thrown if quoted-printable decoding is unsuccessful. Thrown if Charset is not supported.
397      * @see #getCharset()
398      */
399     @Override
400     public String decode(final String sourceStr) throws DecoderException {
401         return this.decode(sourceStr, getCharset());
402     }
403 
404     /**
405      * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters are converted back to their original
406      * representation.
407      *
408      * @param sourceStr     quoted-printable string to convert into its original form.
409      * @param sourceCharset the original string Charset.
410      * @return original string.
411      * @throws DecoderException Thrown if quoted-printable decoding is unsuccessful.
412      * @since 1.7
413      */
414     public String decode(final String sourceStr, final Charset sourceCharset) throws DecoderException {
415         if (sourceStr == null) {
416             return null;
417         }
418         return new String(this.decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset);
419     }
420 
421     /**
422      * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters are converted back to their original
423      * representation.
424      *
425      * @param sourceStr     quoted-printable string to convert into its original form.
426      * @param sourceCharset the original string Charset.
427      * @return original string.
428      * @throws DecoderException             Thrown if quoted-printable decoding is unsuccessful.
429      * @throws UnsupportedEncodingException Thrown if Charset is not supported.
430      */
431     public String decode(final String sourceStr, final String sourceCharset) throws DecoderException, UnsupportedEncodingException {
432         if (sourceStr == null) {
433             return null;
434         }
435         return new String(decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset);
436     }
437 
    /**
     * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
     * <p>
     * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset or only a subset of quoted-printable
     * encoding specification (rule #1 and rule #2) as defined in RFC 1521 and is suitable for encoding binary data and unformatted text.
     * </p>
     * <p>
     * Uses the default printable character set and the strict-mode setting this codec was constructed with.
     * </p>
     *
     * @param bytes array of bytes to be encoded.
     * @return array of bytes containing quoted-printable data.
     */
    @Override
    public byte[] encode(final byte[] bytes) {
        return encodeQuotedPrintable(PRINTABLE_CHARS, bytes, strict);
    }
452 
453     /**
454      * Encodes an object into its quoted-printable safe form. Unsafe characters are escaped.
455      *
456      * @param obj string to convert to a quoted-printable form.
457      * @return quoted-printable object.
458      * @throws EncoderException Thrown if quoted-printable encoding is not applicable to objects of this type or if encoding is unsuccessful.
459      */
460     @Override
461     public Object encode(final Object obj) throws EncoderException {
462         if (obj == null) {
463             return null;
464         }
465         if (obj instanceof byte[]) {
466             return encode((byte[]) obj);
467         }
468         if (obj instanceof String) {
469             return encode((String) obj);
470         }
471         throw new EncoderException("Objects of type " + obj.getClass().getName() + " cannot be quoted-printable encoded");
472     }
473 
474     /**
475      * Encodes a string into its quoted-printable form using the default string Charset. Unsafe characters are escaped.
476      * <p>
477      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset or only a subset of quoted-printable
478      * encoding specification (rule #1 and rule #2) as defined in RFC 1521 and is suitable for encoding binary data and unformatted text.
479      * </p>
480      *
481      * @param sourceStr string to convert to quoted-printable form.
482      * @return quoted-printable string.
483      * @throws EncoderException Thrown if quoted-printable encoding is unsuccessful.
484      *
485      * @see #getCharset()
486      */
487     @Override
488     public String encode(final String sourceStr) throws EncoderException {
489         return encode(sourceStr, getCharset());
490     }
491 
492     /**
493      * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped.
494      * <p>
495      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset or only a subset of quoted-printable
496      * encoding specification (rule #1 and rule #2) as defined in RFC 1521 and is suitable for encoding binary data and unformatted text.
497      * </p>
498      *
499      * @param sourceStr     string to convert to quoted-printable form.
500      * @param sourceCharset the Charset for sourceStr.
501      * @return quoted-printable string.
502      * @since 1.7
503      */
504     public String encode(final String sourceStr, final Charset sourceCharset) {
505         if (sourceStr == null) {
506             return null;
507         }
508         return StringUtils.newStringUsAscii(this.encode(sourceStr.getBytes(sourceCharset)));
509     }
510 
511     /**
512      * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped.
513      * <p>
514      * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset or only a subset of quoted-printable
515      * encoding specification (rule #1 and rule #2) as defined in RFC 1521 and is suitable for encoding binary data and unformatted text.
516      * </p>
517      *
518      * @param sourceStr     string to convert to quoted-printable form.
519      * @param sourceCharset the Charset for sourceStr.
520      * @return quoted-printable string.
521      * @throws UnsupportedEncodingException Thrown if the Charset is not supported.
522      */
523     public String encode(final String sourceStr, final String sourceCharset) throws UnsupportedEncodingException {
524         if (sourceStr == null) {
525             return null;
526         }
527         return StringUtils.newStringUsAscii(encode(sourceStr.getBytes(sourceCharset)));
528     }
529 
    /**
     * Gets the default Charset used for string decoding and encoding.
     *
     * @return the default Charset.
     * @since 1.7
     */
    public Charset getCharset() {
        return this.charset;
    }
539 
    /**
     * Gets the name of the default Charset used for string decoding and encoding.
     *
     * @return the default Charset name, as reported by {@link Charset#name()}.
     */
    public String getDefaultCharset() {
        return this.charset.name();
    }
548 }