1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.net;
19
20 import java.io.ByteArrayOutputStream;
21 import java.io.UnsupportedEncodingException;
22 import java.nio.charset.Charset;
23 import java.nio.charset.IllegalCharsetNameException;
24 import java.nio.charset.StandardCharsets;
25 import java.nio.charset.UnsupportedCharsetException;
26 import java.util.BitSet;
27
28 import org.apache.commons.codec.BinaryDecoder;
29 import org.apache.commons.codec.BinaryEncoder;
30 import org.apache.commons.codec.DecoderException;
31 import org.apache.commons.codec.EncoderException;
32 import org.apache.commons.codec.StringDecoder;
33 import org.apache.commons.codec.StringEncoder;
34 import org.apache.commons.codec.binary.StringUtils;
35
36 /**
37 * Codec for the Quoted-Printable section of <a href="https://www.ietf.org/rfc/rfc1521.txt">RFC 1521</a>.
38 * <p>
39 * The Quoted-Printable encoding is intended to represent data that largely consists of octets that correspond to printable characters in the ASCII character
40 * set. It encodes the data in such a way that the resulting octets are unlikely to be modified by mail transport. If the data being encoded are mostly ASCII
41 * text, the encoded form of the data remains largely recognizable by humans. A body which is entirely ASCII may also be encoded in Quoted-Printable to ensure
42 * the integrity of the data should the message pass through a character- translating, and/or line-wrapping gateway.
43 * </p>
44 * <p>
45 * Note:
46 * </p>
47 * <p>
48 * Depending on the selected {@code strict} parameter, this class will implement a different set of rules of the quoted-printable spec:
49 * </p>
50 * <ul>
51 * <li>{@code strict=false}: only rules #1 and #2 are implemented</li>
52 * <li>{@code strict=true}: all rules #1 through #5 are implemented</li>
53 * </ul>
54 * <p>
55 * Originally, this class only supported the non-strict mode, but the codec in this partial form could already be used for certain applications that do not
56 * require quoted-printable line formatting (rules #3, #4, #5), for instance Q codec. The strict mode has been added in 1.10.
57 * </p>
58 * <p>
59 * This class is immutable and thread-safe.
60 * </p>
61 *
62 * @see <a href="https://www.ietf.org/rfc/rfc1521.txt">RFC 1521 MIME (Multipurpose Internet Mail Extensions) Part One: Mechanisms for Specifying and Describing
63 * the Format of Internet Message Bodies </a>
64 *
65 * @since 1.3
66 */
67 public class QuotedPrintableCodec implements BinaryEncoder, BinaryDecoder, StringEncoder, StringDecoder {
68
69 /**
70 * BitSet of printable characters as defined in RFC 1521.
71 */
72 private static final BitSet PRINTABLE_CHARS = new BitSet(256);
73 private static final byte ESCAPE_CHAR = '=';
74 private static final byte TAB = 9;
75 private static final byte SPACE = 32;
76 private static final byte CR = 13;
77 private static final byte LF = 10;
78
79 /**
80 * Minimum length required for the byte arrays used by encodeQuotedPrintable method.
81 */
82 private static final int MIN_BYTES = 3;
83
84 /**
85 * Safe line length for quoted printable encoded text.
86 */
87 private static final int SAFE_LENGTH = 73;
88
89 // Static initializer for printable chars collection
90 static {
91 // alpha characters
92 for (int i = 33; i <= 60; i++) {
93 PRINTABLE_CHARS.set(i);
94 }
95 for (int i = 62; i <= 126; i++) {
96 PRINTABLE_CHARS.set(i);
97 }
98 PRINTABLE_CHARS.set(TAB);
99 PRINTABLE_CHARS.set(SPACE);
100 }
101
102 /**
103 * Decodes an array quoted-printable characters into an array of original bytes. Escaped characters are converted back to their original representation.
104 * <p>
105 * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as defined in RFC 1521.
106 * </p>
107 *
108 * @param bytes array of quoted-printable characters.
109 * @return array of original bytes.
110 * @throws DecoderException Thrown if quoted-printable decoding is unsuccessful.
111 */
112 public static final byte[] decodeQuotedPrintable(final byte[] bytes) throws DecoderException {
113 if (bytes == null) {
114 return null;
115 }
116 final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
117 for (int i = 0; i < bytes.length; i++) {
118 final int b = bytes[i];
119 if (b == ESCAPE_CHAR) {
120 try {
121 // if the next octet is a CR we have found a soft line break
122 if (bytes[++i] == CR) {
123 continue;
124 }
125 final int u = Utils.digit16(bytes[i]);
126 final int l = Utils.digit16(bytes[++i]);
127 buffer.write((char) ((u << 4) + l));
128 } catch (final ArrayIndexOutOfBoundsException e) {
129 throw new DecoderException("Invalid quoted-printable encoding", e);
130 }
131 } else if (b != CR && b != LF) {
132 // every other octet is appended except for CR & LF
133 buffer.write(b);
134 }
135 }
136 return buffer.toByteArray();
137 }
138
139 /**
140 * Encodes a byte in the buffer.
141 *
142 * @param b byte to write.
143 * @param encode indicates whether the octet shall be encoded.
144 * @param buffer the buffer to write to.
145 * @return the number of bytes that have been written to the buffer.
146 */
147 private static int encodeByte(final int b, final boolean encode, final ByteArrayOutputStream buffer) {
148 if (encode) {
149 return encodeQuotedPrintable(b, buffer);
150 }
151 buffer.write(b);
152 return 1;
153 }
154
155 /**
156 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
157 * <p>
158 * This function implements a subset of quoted-printable encoding specification (rule #1 and rule #2) as defined in RFC 1521 and is suitable for encoding
159 * binary data and unformatted text.
160 * </p>
161 *
162 * @param printable bitset of characters deemed quoted-printable.
163 * @param bytes array of bytes to be encoded.
164 * @return array of bytes containing quoted-printable data.
165 */
166 public static final byte[] encodeQuotedPrintable(final BitSet printable, final byte[] bytes) {
167 return encodeQuotedPrintable(printable, bytes, false);
168 }
169
170 /**
171 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
172 * <p>
173 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset or only a subset of quoted-printable
174 * encoding specification (rule #1 and rule #2) as defined in RFC 1521 and is suitable for encoding binary data and unformatted text.
175 * </p>
176 *
177 * @param printable bitset of characters deemed quoted-printable.
178 * @param bytes array of bytes to be encoded.
179 * @param strict if {@code true} the full ruleset is used, otherwise only rule #1 and rule #2.
180 * @return array of bytes containing quoted-printable data.
181 * @since 1.10
182 */
183 public static final byte[] encodeQuotedPrintable(BitSet printable, final byte[] bytes, final boolean strict) {
184 if (bytes == null) {
185 return null;
186 }
187 if (printable == null) {
188 printable = PRINTABLE_CHARS;
189 }
190 final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
191 final int bytesLength = bytes.length;
192 if (strict) {
193 if (bytesLength < MIN_BYTES) {
194 return null;
195 }
196 int pos = 1;
197 // encode up to buffer.length - 3, the last three octets will be treated
198 // separately for simplification of note #3
199 for (int i = 0; i < bytesLength - 3; i++) {
200 final int b = getUnsignedOctet(i, bytes);
201 if (pos < SAFE_LENGTH) {
202 // up to this length it is safe to add any byte, encoded or not
203 pos += encodeByte(b, !printable.get(b), buffer);
204 } else {
205 // rule #3: whitespace at the end of a line *must* be encoded
206 encodeByte(b, !printable.get(b) || isWhitespace(b), buffer);
207 // rule #5: soft line break
208 buffer.write(ESCAPE_CHAR);
209 buffer.write(CR);
210 buffer.write(LF);
211 pos = 1;
212 }
213 }
214 // rule #3: whitespace at the end of a line *must* be encoded
215 // if we would do a soft break line after this octet, encode whitespace
216 int b = getUnsignedOctet(bytesLength - 3, bytes);
217 boolean encode = !printable.get(b) || isWhitespace(b) && pos > SAFE_LENGTH - 5;
218 pos += encodeByte(b, encode, buffer);
219 // note #3: '=' *must not* be the ultimate or penultimate character
220 // simplification: if < 6 bytes left, do a soft line break as we may need
221 // exactly 6 bytes space for the last 2 bytes
222 if (pos > SAFE_LENGTH - 2) {
223 buffer.write(ESCAPE_CHAR);
224 buffer.write(CR);
225 buffer.write(LF);
226 }
227 for (int i = bytesLength - 2; i < bytesLength; i++) {
228 b = getUnsignedOctet(i, bytes);
229 // rule #3: trailing whitespace shall be encoded
230 encode = !printable.get(b) || i > bytesLength - 2 && isWhitespace(b);
231 encodeByte(b, encode, buffer);
232 }
233 } else {
234 for (final byte c : bytes) {
235 int b = c;
236 if (b < 0) {
237 b = 256 + b;
238 }
239 if (printable.get(b)) {
240 buffer.write(b);
241 } else {
242 encodeQuotedPrintable(b, buffer);
243 }
244 }
245 }
246 return buffer.toByteArray();
247 }
248
249 /**
250 * Encodes byte into its quoted-printable representation.
251 *
252 * @param b byte to encode.
253 * @param buffer the buffer to write to.
254 * @return The number of bytes written to the {@code buffer}.
255 */
256 private static int encodeQuotedPrintable(final int b, final ByteArrayOutputStream buffer) {
257 buffer.write(ESCAPE_CHAR);
258 final char hex1 = Utils.hexChar(b >> 4);
259 final char hex2 = Utils.hexChar(b);
260 buffer.write(hex1);
261 buffer.write(hex2);
262 return 3;
263 }
264
265 /**
266 * Gets the byte at position {@code index} of the byte array and make sure it is unsigned.
267 *
268 * @param index position in the array.
269 * @param bytes the byte array.
270 * @return the unsigned octet at position {@code index} from the array.
271 */
272 private static int getUnsignedOctet(final int index, final byte[] bytes) {
273 int b = bytes[index];
274 if (b < 0) {
275 b = 256 + b;
276 }
277 return b;
278 }
279
280 /**
281 * Checks whether the given byte is whitespace.
282 *
283 * @param b byte to be checked.
284 * @return {@code true} if the byte is either a space or tab character.
285 */
286 private static boolean isWhitespace(final int b) {
287 return b == SPACE || b == TAB;
288 }
289
290 /**
291 * The default Charset used for string decoding and encoding.
292 */
293 private final Charset charset;
294
295 /**
296 * Indicates whether soft line breaks shall be used during encoding (rule #3-5).
297 */
298 private final boolean strict;
299
300 /**
301 * Default constructor, assumes default Charset of {@link StandardCharsets#UTF_8}
302 */
303 public QuotedPrintableCodec() {
304 this(StandardCharsets.UTF_8, false);
305 }
306
307 /**
308 * Constructor which allows for the selection of the strict mode.
309 *
310 * @param strict if {@code true}, soft line breaks will be used.
311 * @since 1.10
312 */
313 public QuotedPrintableCodec(final boolean strict) {
314 this(StandardCharsets.UTF_8, strict);
315 }
316
317 /**
318 * Constructor which allows for the selection of a default Charset.
319 *
320 * @param charset the default string Charset to use.
321 * @since 1.7
322 */
323 public QuotedPrintableCodec(final Charset charset) {
324 this(charset, false);
325 }
326
327 /**
328 * Constructor which allows for the selection of a default Charset and strict mode.
329 *
330 * @param charset the default string Charset to use.
331 * @param strict if {@code true}, soft line breaks will be used.
332 * @since 1.10
333 */
334 public QuotedPrintableCodec(final Charset charset, final boolean strict) {
335 this.charset = charset;
336 this.strict = strict;
337 }
338
339 /**
340 * Constructor which allows for the selection of a default Charset.
341 *
342 * @param charsetName the default string Charset to use.
343 * @throws UnsupportedCharsetException If no support for the named Charset is available in this instance of the Java virtual machine.
344 * @throws IllegalArgumentException If the given charsetName is null.
345 * @throws IllegalCharsetNameException If the given Charset name is illegal.
346 *
347 * @since 1.7 throws UnsupportedCharsetException if the named Charset is unavailable
348 */
349 public QuotedPrintableCodec(final String charsetName) throws IllegalCharsetNameException, IllegalArgumentException, UnsupportedCharsetException {
350 this(Charset.forName(charsetName), false);
351 }
352
353 /**
354 * Decodes an array of quoted-printable characters into an array of original bytes. Escaped characters are converted back to their original representation.
355 * <p>
356 * This function fully implements the quoted-printable encoding specification (rule #1 through rule #5) as defined in RFC 1521.
357 * </p>
358 *
359 * @param bytes array of quoted-printable characters.
360 * @return array of original bytes.
361 * @throws DecoderException Thrown if quoted-printable decoding is unsuccessful.
362 */
363 @Override
364 public byte[] decode(final byte[] bytes) throws DecoderException {
365 return decodeQuotedPrintable(bytes);
366 }
367
368 /**
369 * Decodes a quoted-printable object into its original form. Escaped characters are converted back to their original representation.
370 *
371 * @param obj quoted-printable object to convert into its original form.
372 * @return original object.
373 * @throws DecoderException Thrown if the argument is not a {@code String} or {@code byte[]}. Thrown if a failure condition is encountered during the decode
374 * process.
375 */
376 @Override
377 public Object decode(final Object obj) throws DecoderException {
378 if (obj == null) {
379 return null;
380 }
381 if (obj instanceof byte[]) {
382 return decode((byte[]) obj);
383 }
384 if (obj instanceof String) {
385 return decode((String) obj);
386 }
387 throw new DecoderException("Objects of type " + obj.getClass().getName() + " cannot be quoted-printable decoded");
388 }
389
390 /**
391 * Decodes a quoted-printable string into its original form using the default string Charset. Escaped characters are converted back to their original
392 * representation.
393 *
394 * @param sourceStr quoted-printable string to convert into its original form.
395 * @return original string.
396 * @throws DecoderException Thrown if quoted-printable decoding is unsuccessful. Thrown if Charset is not supported.
397 * @see #getCharset()
398 */
399 @Override
400 public String decode(final String sourceStr) throws DecoderException {
401 return this.decode(sourceStr, getCharset());
402 }
403
404 /**
405 * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters are converted back to their original
406 * representation.
407 *
408 * @param sourceStr quoted-printable string to convert into its original form.
409 * @param sourceCharset the original string Charset.
410 * @return original string.
411 * @throws DecoderException Thrown if quoted-printable decoding is unsuccessful.
412 * @since 1.7
413 */
414 public String decode(final String sourceStr, final Charset sourceCharset) throws DecoderException {
415 if (sourceStr == null) {
416 return null;
417 }
418 return new String(this.decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset);
419 }
420
421 /**
422 * Decodes a quoted-printable string into its original form using the specified string Charset. Escaped characters are converted back to their original
423 * representation.
424 *
425 * @param sourceStr quoted-printable string to convert into its original form.
426 * @param sourceCharset the original string Charset.
427 * @return original string.
428 * @throws DecoderException Thrown if quoted-printable decoding is unsuccessful.
429 * @throws UnsupportedEncodingException Thrown if Charset is not supported.
430 */
431 public String decode(final String sourceStr, final String sourceCharset) throws DecoderException, UnsupportedEncodingException {
432 if (sourceStr == null) {
433 return null;
434 }
435 return new String(decode(StringUtils.getBytesUsAscii(sourceStr)), sourceCharset);
436 }
437
438 /**
439 * Encodes an array of bytes into an array of quoted-printable 7-bit characters. Unsafe characters are escaped.
440 * <p>
441 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset or only a subset of quoted-printable
442 * encoding specification (rule #1 and rule #2) as defined in RFC 1521 and is suitable for encoding binary data and unformatted text.
443 * </p>
444 *
445 * @param bytes array of bytes to be encoded.
446 * @return array of bytes containing quoted-printable data.
447 */
448 @Override
449 public byte[] encode(final byte[] bytes) {
450 return encodeQuotedPrintable(PRINTABLE_CHARS, bytes, strict);
451 }
452
453 /**
454 * Encodes an object into its quoted-printable safe form. Unsafe characters are escaped.
455 *
456 * @param obj string to convert to a quoted-printable form.
457 * @return quoted-printable object.
458 * @throws EncoderException Thrown if quoted-printable encoding is not applicable to objects of this type or if encoding is unsuccessful.
459 */
460 @Override
461 public Object encode(final Object obj) throws EncoderException {
462 if (obj == null) {
463 return null;
464 }
465 if (obj instanceof byte[]) {
466 return encode((byte[]) obj);
467 }
468 if (obj instanceof String) {
469 return encode((String) obj);
470 }
471 throw new EncoderException("Objects of type " + obj.getClass().getName() + " cannot be quoted-printable encoded");
472 }
473
474 /**
475 * Encodes a string into its quoted-printable form using the default string Charset. Unsafe characters are escaped.
476 * <p>
477 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset or only a subset of quoted-printable
478 * encoding specification (rule #1 and rule #2) as defined in RFC 1521 and is suitable for encoding binary data and unformatted text.
479 * </p>
480 *
481 * @param sourceStr string to convert to quoted-printable form.
482 * @return quoted-printable string.
483 * @throws EncoderException Thrown if quoted-printable encoding is unsuccessful.
484 *
485 * @see #getCharset()
486 */
487 @Override
488 public String encode(final String sourceStr) throws EncoderException {
489 return encode(sourceStr, getCharset());
490 }
491
492 /**
493 * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped.
494 * <p>
495 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset or only a subset of quoted-printable
496 * encoding specification (rule #1 and rule #2) as defined in RFC 1521 and is suitable for encoding binary data and unformatted text.
497 * </p>
498 *
499 * @param sourceStr string to convert to quoted-printable form.
500 * @param sourceCharset the Charset for sourceStr.
501 * @return quoted-printable string.
502 * @since 1.7
503 */
504 public String encode(final String sourceStr, final Charset sourceCharset) {
505 if (sourceStr == null) {
506 return null;
507 }
508 return StringUtils.newStringUsAscii(this.encode(sourceStr.getBytes(sourceCharset)));
509 }
510
511 /**
512 * Encodes a string into its quoted-printable form using the specified Charset. Unsafe characters are escaped.
513 * <p>
514 * Depending on the selection of the {@code strict} parameter, this function either implements the full ruleset or only a subset of quoted-printable
515 * encoding specification (rule #1 and rule #2) as defined in RFC 1521 and is suitable for encoding binary data and unformatted text.
516 * </p>
517 *
518 * @param sourceStr string to convert to quoted-printable form.
519 * @param sourceCharset the Charset for sourceStr.
520 * @return quoted-printable string.
521 * @throws UnsupportedEncodingException Thrown if the Charset is not supported.
522 */
523 public String encode(final String sourceStr, final String sourceCharset) throws UnsupportedEncodingException {
524 if (sourceStr == null) {
525 return null;
526 }
527 return StringUtils.newStringUsAscii(encode(sourceStr.getBytes(sourceCharset)));
528 }
529
530 /**
531 * Gets the default Charset name used for string decoding and encoding.
532 *
533 * @return the default Charset name.
534 * @since 1.7
535 */
536 public Charset getCharset() {
537 return this.charset;
538 }
539
540 /**
541 * Gets the default Charset name used for string decoding and encoding.
542 *
543 * @return the default Charset name.
544 */
545 public String getDefaultCharset() {
546 return this.charset.name();
547 }
548 }