View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.imaging.formats.jpeg.iptc;
19  
20  import java.io.ByteArrayInputStream;
21  import java.io.ByteArrayOutputStream;
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.nio.ByteOrder;
25  import java.nio.charset.Charset;
26  import java.nio.charset.StandardCharsets;
27  import java.util.ArrayList;
28  import java.util.Arrays;
29  import java.util.Comparator;
30  import java.util.List;
31  import java.util.Objects;
32  import java.util.logging.Level;
33  import java.util.logging.Logger;
34  
35  import org.apache.commons.imaging.ImagingConstants;
36  import org.apache.commons.imaging.ImagingException;
37  import org.apache.commons.imaging.ImagingParameters;
38  import org.apache.commons.imaging.common.AbstractBinaryOutputStream;
39  import org.apache.commons.imaging.common.Allocator;
40  import org.apache.commons.imaging.common.BinaryFileParser;
41  import org.apache.commons.imaging.common.BinaryFunctions;
42  import org.apache.commons.imaging.common.ByteConversions;
43  import org.apache.commons.imaging.formats.jpeg.JpegConstants;
44  import org.apache.commons.imaging.formats.jpeg.JpegImagingParameters;
45  import org.apache.commons.imaging.internal.Debug;
46  
47  public class IptcParser extends BinaryFileParser {
48  
    /** Logger used for non-fatal diagnostics while parsing IPTC data. */
    private static final Logger LOGGER = Logger.getLogger(IptcParser.class.getName());

    /** Byte order used when reading multi-byte fields of an App13 segment. */
    private static final ByteOrder APP13_BYTE_ORDER = ByteOrder.BIG_ENDIAN;

    /**
     * Block types (or Image Resource IDs) that are not recommended to be interpreted when libraries process Photoshop IPTC metadata.
     *
     * @see <a href="https://www.adobe.com/devnet-apps/photoshop/fileformatashtml/">Adobe Photoshop File Formats Specification</a>
     * @see <a href="https://issues.apache.org/jira/browse/IMAGING-246">IMAGING-246</a>
     * @since 1.0-alpha2
     */
    private static final List<Integer> PHOTOSHOP_IGNORED_BLOCK_TYPE = Arrays.asList(1084, 1085, 1086, 1087);

    /** Charset used for record values unless a coded character set record selects another one. */
    private static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
    /** Record type, inside the envelope record, that announces the coded character set. */
    private static final int ENV_TAG_CODED_CHARACTER_SET = 90;
    /** Escape sequence (ESC, '%', 'G') that this class maps to UTF-8 when reading and emits when writing UTF-8 values. */
    private static final byte[] CHARACTER_ESCAPE_SEQUENCE = { '\u001B', '%', 'G' };

    /**
     * Constructs a new instance with the default, big-endian, byte order.
     */
    public IptcParser() {
        // empty
    }
72  
73      private Charset findCharset(final byte[] codedCharset) {
74          final String codedCharsetString = new String(codedCharset, StandardCharsets.ISO_8859_1);
75          try {
76              if (Charset.isSupported(codedCharsetString)) {
77                  return Charset.forName(codedCharsetString);
78              }
79          } catch (final IllegalArgumentException ignored) {
80              // ignored
81          }
82          // check if encoding is a escape sequence
83          // normalize encoding byte sequence
84          final byte[] codedCharsetNormalized = Allocator.byteArray(codedCharset.length);
85          int j = 0;
86          for (final byte element : codedCharset) {
87              if (element != ' ') {
88                  codedCharsetNormalized[j++] = element;
89              }
90          }
91  
92          if (Objects.deepEquals(codedCharsetNormalized, CHARACTER_ESCAPE_SEQUENCE)) {
93              return StandardCharsets.UTF_8;
94          }
95          return DEFAULT_CHARSET;
96      }
97  
98      public boolean isPhotoshopJpegSegment(final byte[] segmentData) {
99          if (!JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.isStartOf(segmentData)) {
100             return false;
101         }
102 
103         final int index = JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.size();
104         return index + 4 <= segmentData.length && ByteConversions.toInt(segmentData, index, APP13_BYTE_ORDER) == JpegConstants.CONST_8BIM;
105     }
106 
107     protected List<IptcBlock> parseAllBlocks(final byte[] bytes, final boolean strict) throws ImagingException, IOException {
108         final List<IptcBlock> blocks = new ArrayList<>();
109 
110         try (InputStream bis = new ByteArrayInputStream(bytes)) {
111 
112             // Note that these are unsigned quantities. Name is always an even
113             // number of bytes (including the 1st byte, which is the size.)
114 
115             final byte[] idString = BinaryFunctions.readBytes("", bis, JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.size(),
116                     "App13 Segment missing identification string");
117             if (!JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.equals(idString)) {
118                 throw new ImagingException("Not a Photoshop App13 Segment");
119             }
120 
121             // int index = PHOTOSHOP_IDENTIFICATION_STRING.length;
122 
123             while (true) {
124                 final int imageResourceBlockSignature;
125                 try {
126                     imageResourceBlockSignature = BinaryFunctions.read4Bytes("", bis, "Image Resource Block missing identification string", APP13_BYTE_ORDER);
127                 } catch (final IOException ioEx) {
128                     break;
129                 }
130                 if (imageResourceBlockSignature != JpegConstants.CONST_8BIM) {
131                     throw new ImagingException("Invalid Image Resource Block Signature");
132                 }
133 
134                 final int blockType = BinaryFunctions.read2Bytes("", bis, "Image Resource Block missing type", APP13_BYTE_ORDER);
135                 Debug.debug("blockType: " + blockType + " (0x" + Integer.toHexString(blockType) + ")");
136 
137                 // skip blocks that the photoshop spec recommends to, see IMAGING-246
138                 if (PHOTOSHOP_IGNORED_BLOCK_TYPE.contains(blockType)) {
139                     Debug.debug("Skipping blockType: " + blockType + " (0x" + Integer.toHexString(blockType) + ")");
140                     // if there is still data in this block, before the next image resource block
141                     // (8BIM), then we must consume these bytes to leave a pointer ready to read
142                     // the next block
143                     BinaryFunctions.searchQuad(JpegConstants.CONST_8BIM, bis);
144                     continue;
145                 }
146 
147                 final int blockNameLength = BinaryFunctions.readByte("Name length", bis, "Image Resource Block missing name length");
148                 if (blockNameLength > 0) {
149                     Debug.debug("blockNameLength: " + blockNameLength + " (0x" + Integer.toHexString(blockNameLength) + ")");
150                 }
151                 final byte[] blockNameBytes;
152                 if (blockNameLength == 0) {
153                     BinaryFunctions.readByte("Block name bytes", bis, "Image Resource Block has invalid name");
154                     blockNameBytes = ImagingConstants.EMPTY_BYTE_ARRAY;
155                 } else {
156                     try {
157                         blockNameBytes = BinaryFunctions.readBytes("", bis, blockNameLength, "Invalid Image Resource Block name");
158                     } catch (final IOException ioEx) {
159                         if (strict) {
160                             throw ioEx;
161                         }
162                         break;
163                     }
164 
165                     if (blockNameLength % 2 == 0) {
166                         BinaryFunctions.readByte("Padding byte", bis, "Image Resource Block missing padding byte");
167                     }
168                 }
169 
170                 final int blockSize = BinaryFunctions.read4Bytes("", bis, "Image Resource Block missing size", APP13_BYTE_ORDER);
171                 Debug.debug("blockSize: " + blockSize + " (0x" + Integer.toHexString(blockSize) + ")");
172 
173                 /*
174                  * doesn't catch cases where blocksize is invalid but is still less than bytes.length but will at least prevent OutOfMemory errors
175                  */
176                 if (blockSize > bytes.length) {
177                     throw new ImagingException("Invalid Block Size : " + blockSize + " > " + bytes.length);
178                 }
179 
180                 final byte[] blockData;
181                 try {
182                     blockData = BinaryFunctions.readBytes("", bis, blockSize, "Invalid Image Resource Block data");
183                 } catch (final IOException ioEx) {
184                     if (strict) {
185                         throw ioEx;
186                     }
187                     break;
188                 }
189 
190                 blocks.add(new IptcBlock(blockType, blockNameBytes, blockData));
191 
192                 if (blockSize % 2 != 0) {
193                     BinaryFunctions.readByte("Padding byte", bis, "Image Resource Block missing padding byte");
194                 }
195             }
196 
197             return blocks;
198         }
199     }
200 
201     protected List<IptcRecord> parseIptcBlock(final byte[] bytes) {
202         Charset charset = DEFAULT_CHARSET;
203         final List<IptcRecord> elements = new ArrayList<>();
204 
205         int index = 0;
206         // Integer recordVersion = null;
207         while (index + 1 < bytes.length) {
208             final int tagMarker = 0xff & bytes[index++];
209             Debug.debug("tagMarker: " + tagMarker + " (0x" + Integer.toHexString(tagMarker) + ")");
210 
211             if (tagMarker != IptcConstants.IPTC_RECORD_TAG_MARKER) {
212                 if (LOGGER.isLoggable(Level.FINE)) {
213                     LOGGER.fine("Unexpected record tag marker in IPTC data.");
214                 }
215                 return elements;
216             }
217 
218             final int recordNumber = 0xff & bytes[index++];
219             Debug.debug("recordNumber: " + recordNumber + " (0x" + Integer.toHexString(recordNumber) + ")");
220 
221             // int recordPrefix = convertByteArrayToShort("recordPrefix", index,
222             // bytes);
223             // if (verbose)
224             // Debug.debug("recordPrefix", recordPrefix + " (0x"
225             // + Integer.toHexString(recordPrefix) + ")");
226             // index += 2;
227             //
228             // if (recordPrefix != IPTC_RECORD_PREFIX)
229             // {
230             // if (verbose)
231             // System.out
232             // .println("Unexpected record prefix in IPTC data!");
233             // return elements;
234             // }
235 
236             // throw new ImageReadException(
237             // "Unexpected record prefix in IPTC data.");
238 
239             final int recordType = 0xff & bytes[index];
240             Debug.debug("recordType: " + recordType + " (0x" + Integer.toHexString(recordType) + ")");
241             index++;
242 
243             final int recordSize = ByteConversions.toUInt16(bytes, index, getByteOrder());
244             index += 2;
245 
246             final boolean extendedDataset = recordSize > IptcConstants.IPTC_NON_EXTENDED_RECORD_MAXIMUM_SIZE;
247             final int dataFieldCountLength = recordSize & 0x7fff;
248             if (extendedDataset) {
249                 Debug.debug("extendedDataset. dataFieldCountLength: " + dataFieldCountLength);
250             }
251             if (extendedDataset) {
252                 // ignore extended dataset and everything after.
253                 return elements;
254             }
255 
256             final byte[] recordData = BinaryFunctions.copyOfRange(bytes, index, recordSize);
257             index += recordSize;
258 
259             // Debug.debug("recordSize", recordSize + " (0x"
260             // + Integer.toHexString(recordSize) + ")");
261 
262             if (recordNumber == IptcConstants.IPTC_ENVELOPE_RECORD_NUMBER && recordType == ENV_TAG_CODED_CHARACTER_SET) {
263                 charset = findCharset(recordData);
264                 continue;
265             }
266 
267             if (recordNumber != IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER) {
268                 continue;
269             }
270 
271             if (recordType == 0) {
272                 if (LOGGER.isLoggable(Level.FINE)) {
273                     LOGGER.fine("ignore record version record! " + elements.size());
274                 }
275                 // ignore "record version" record;
276                 continue;
277             }
278             // if (recordVersion == null)
279             // {
280             // // The first record in a JPEG/Photoshop IPTC block must be
281             // // the record version.
282             // if (recordType != 0)
283             // throw new ImageReadException("Missing record version: "
284             // + recordType);
285             // recordVersion = new Integer(convertByteArrayToShort(
286             // "recordNumber", recordData));
287             //
288             // if (recordSize != 2)
289             // throw new ImageReadException(
290             // "Invalid record version record size: " + recordSize);
291             //
292             // // JPEG/Photoshop IPTC metadata is always in Record version
293             // // 2
294             // if (recordVersion.intValue() != 2)
295             // throw new ImageReadException(
296             // "Invalid IPTC record version: " + recordVersion);
297             //
298             // // Debug.debug("recordVersion", recordVersion);
299             // continue;
300             // }
301 
302             final String value = new String(recordData, charset);
303 
304             final IptcType iptcType = IptcTypeLookup.getIptcType(recordType);
305 
306             // Debug.debug("iptcType", iptcType);
307             // debugByteArray("iptcData", iptcData);
308             // Debug.debug();
309 
310             // if (recordType == IPTC_TYPE_CREDIT.type
311             // || recordType == IPTC_TYPE_OBJECT_NAME.type)
312             // {
313             // this.debugByteArray("recordData", recordData);
314             // Debug.debug("index", IPTC_TYPE_CREDIT.name);
315             // }
316 
317             final IptcRecord element = new IptcRecord(iptcType, value);
318             elements.add(element);
319         }
320 
321         return elements;
322     }
323 
324     public PhotoshopApp13Data parsePhotoshopSegment(final byte[] bytes, final boolean strict) throws ImagingException, IOException {
325         final List<IptcRecord> records = new ArrayList<>();
326 
327         final List<IptcBlock> blocks = parseAllBlocks(bytes, strict);
328 
329         for (final IptcBlock block : blocks) {
330             // Ignore everything but IPTC data.
331             if (!block.isIptcBlock()) {
332                 continue;
333             }
334 
335             records.addAll(parseIptcBlock(block.getBlockData()));
336         }
337 
338         return new PhotoshopApp13Data(records, blocks);
339     }
340 
341     // private void writeIPTCRecord(BinaryOutputStream bos, )
342 
343     /*
344      * In practice, App13 segments are only used for Photoshop/IPTC metadata. However, we should not treat App13 signatures without Photoshop's signature as
345      * Photoshop/IPTC segments.
346      *
347      * A Photoshop/IPTC App13 segment begins with the Photoshop Identification string.
348      *
349      * There follows 0-N blocks (Photoshop calls them "Image Resource Blocks").
350      *
351      * Each block has the following structure:
352      *
353      * 1. 4-byte type. This is always "8BIM" for blocks in a Photoshop App13 segment. 2. 2-byte id. IPTC data is stored in blocks with id 0x0404, aka.
354      * IPTC_NAA_RECORD_IMAGE_RESOURCE_ID 3. Block name as a Pascal String. This is padded to have an even length. 4. 4-byte size (in bytes). 5. Block data. This
355      * is also padded to have an even length.
356      *
357      * The block data consists of a 0-N records. A record has the following structure:
358      *
359      * 1. 2-byte prefix. The value is always 0x1C02 2. 1-byte record type. The record types are documented by the IPTC. See IptcConstants. 3. 2-byte record size
360      * (in bytes). 4. Record data, "record size" bytes long.
361      *
362      * Record data (unlike block data) is NOT padded to have an even length.
363      *
364      * Record data, for IPTC record, should always be ISO-8859-1. But according to SANSELAN-33, this isn't always the case.
365      *
366      * The exception is the first record in the block, which must always be a record version record, whose value is a two-byte number; the value is 0x02.
367      *
368      * Some IPTC blocks are missing this first "record version" record, so we don't require it.
369      */
370     public PhotoshopApp13Data parsePhotoshopSegment(final byte[] bytes, final ImagingParameters<JpegImagingParameters> params)
371             throws ImagingException, IOException {
372         final boolean strict = params != null && params.isStrict();
373 
374         return parsePhotoshopSegment(bytes, strict);
375     }
376 
    /**
     * Serializes the given records as IPTC block data without forcing UTF-8; delegates to {@link #writeIptcBlock(List, boolean)} with
     * {@code forceUtf8Encoding} set to {@code false}.
     *
     * @param elements the records to write.
     * @return the serialized block data.
     * @throws ImagingException if a record cannot be written.
     * @throws IOException      if writing to the underlying stream fails.
     */
    public byte[] writeIptcBlock(final List<IptcRecord> elements) throws ImagingException, IOException {
        return writeIptcBlock(elements, false);
    }
380 
381     public byte[] writeIptcBlock(List<IptcRecord> elements, final boolean forceUtf8Encoding) throws ImagingException, IOException {
382         Charset charset;
383         if (forceUtf8Encoding) {
384             // Using UTF-8 is forced
385             charset = StandardCharsets.UTF_8;
386         } else {
387             // Check if all values can be converted to bytes with DEFAULT_CHARSET,
388             // otherwise use UTF-8
389             charset = DEFAULT_CHARSET;
390             for (final IptcRecord element : elements) {
391                 final byte[] recordData = element.getValue().getBytes(charset);
392                 if (!new String(recordData, charset).equals(element.getValue())) {
393                     charset = StandardCharsets.UTF_8;
394                     break;
395                 }
396             }
397         }
398         final ByteArrayOutputStream baos = new ByteArrayOutputStream();
399         try (AbstractBinaryOutputStream bos = AbstractBinaryOutputStream.create(baos, getByteOrder())) {
400             if (!charset.equals(DEFAULT_CHARSET)) {
401                 bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER);
402                 bos.write(IptcConstants.IPTC_ENVELOPE_RECORD_NUMBER);
403                 bos.write(ENV_TAG_CODED_CHARACTER_SET);
404                 final byte[] codedCharset = CHARACTER_ESCAPE_SEQUENCE;
405                 bos.write2Bytes(codedCharset.length);
406                 bos.write(codedCharset);
407             }
408 
409             // first, right record version record
410             bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER);
411             bos.write(IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER);
412             bos.write(IptcTypes.RECORD_VERSION.type); // record version record
413                                                       // type.
414             bos.write2Bytes(2); // record version record size
415             bos.write2Bytes(2); // record version value
416 
417             // make a copy of the list.
418             elements = new ArrayList<>(elements);
419 
420             // sort the list. Records must be in numerical order.
421             final Comparator<IptcRecord> comparator = (e1, e2) -> e2.iptcType.getType() - e1.iptcType.getType();
422             elements.sort(comparator);
423             // TODO: make sure order right
424 
425             // write the list.
426             for (final IptcRecord element : elements) {
427                 if (element.iptcType == IptcTypes.RECORD_VERSION) {
428                     continue; // ignore
429                 }
430 
431                 bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER);
432                 bos.write(IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER);
433                 if (element.iptcType.getType() < 0 || element.iptcType.getType() > 0xff) {
434                     throw new ImagingException("Invalid record type: " + element.iptcType.getType());
435                 }
436                 bos.write(element.iptcType.getType());
437 
438                 final byte[] recordData = element.getValue().getBytes(charset);
439                 /*
440                  * if (!new String(recordData, charset).equals(element.getValue())) { throw new ImageWriteException( "Invalid record value, not " +
441                  * charset.name()); }
442                  */
443 
444                 bos.write2Bytes(recordData.length);
445                 bos.write(recordData);
446             }
447         }
448 
449         return baos.toByteArray();
450     }
451 
452     public byte[] writePhotoshopApp13Segment(final PhotoshopApp13Data data) throws IOException, ImagingException {
453         try (ByteArrayOutputStream os = new ByteArrayOutputStream();
454                 AbstractBinaryOutputStream bos = AbstractBinaryOutputStream.bigEndian(os)) {
455 
456             JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.writeTo(bos);
457 
458             final List<IptcBlock> blocks = data.getRawBlocks();
459             for (final IptcBlock block : blocks) {
460                 bos.write4Bytes(JpegConstants.CONST_8BIM);
461 
462                 if (block.getBlockType() < 0 || block.getBlockType() > 0xffff) {
463                     throw new ImagingException("Invalid IPTC block type.");
464                 }
465                 bos.write2Bytes(block.getBlockType());
466 
467                 final byte[] blockNameBytes = block.getBlockNameBytes();
468                 if (blockNameBytes.length > 255) {
469                     throw new ImagingException("IPTC block name is too long: " + blockNameBytes.length);
470                 }
471                 bos.write(blockNameBytes.length);
472                 bos.write(blockNameBytes);
473                 if (blockNameBytes.length % 2 == 0) {
474                     bos.write(0); // pad to even size, including length byte.
475                 }
476 
477                 final byte[] blockData = block.getBlockData();
478                 if (blockData.length > IptcConstants.IPTC_NON_EXTENDED_RECORD_MAXIMUM_SIZE) {
479                     throw new ImagingException("IPTC block data is too long: " + blockData.length);
480                 }
481                 bos.write4Bytes(blockData.length);
482                 bos.write(blockData);
483                 if (blockData.length % 2 == 1) {
484                     bos.write(0); // pad to even size
485                 }
486             }
487 
488             bos.flush();
489             return os.toByteArray();
490         }
491     }
492 
493 }