1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.text;
18
19 import java.util.ArrayList;
20 import java.util.Arrays;
21 import java.util.Collections;
22 import java.util.List;
23 import java.util.ListIterator;
24 import java.util.NoSuchElementException;
25
26 import org.apache.commons.lang3.ArrayUtils;
27 import org.apache.commons.lang3.StringUtils;
28 import org.apache.commons.text.matcher.StringMatcher;
29 import org.apache.commons.text.matcher.StringMatcherFactory;
30
31 /**
32 * Tokenizes a string based on delimiters (separators) and supporting quoting and ignored character concepts.
33 * <p>
34 * This class can split a String into many smaller strings. It aims to do a similar job to
35 * {@link java.util.StringTokenizer StringTokenizer}, however it offers much more control and flexibility including
36 * implementing the {@code ListIterator} interface. By default, it is set up like {@code StringTokenizer}.
37 * <p>
38 * The input String is split into a number of <em>tokens</em>. Each token is separated from the next String by a
39 * <em>delimiter</em>. One or more delimiter characters must be specified.
40 * <p>
41 * Each token may be surrounded by quotes. The <em>quote</em> matcher specifies the quote character(s). A quote may be
42 * escaped within a quoted section by duplicating itself.
43 * <p>
44 * Between each token and the delimiter are potentially characters that need trimming. The <em>trimmer</em> matcher
45 * specifies these characters. One usage might be to trim whitespace characters.
46 * <p>
47 * At any point outside the quotes there might potentially be invalid characters. The <em>ignored</em> matcher specifies
48 * these characters to be removed. One usage might be to remove new line characters.
49 * <p>
50 * Empty tokens may be removed or returned as null.
51 *
52 * <pre>
53 * "a,b,c" - Three tokens "a","b","c" (comma delimiter)
54 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace)
55 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
56 * </pre>
57 *
58 * <table>
59 * <caption>StringTokenizer properties and options</caption>
60 * <tr>
61 * <th>Property</th>
62 * <th>Type</th>
63 * <th>Default</th>
64 * </tr>
65 * <tr>
66 * <td>delim</td>
67 * <td>CharSetMatcher</td>
68 * <td>{ \t\n\r\f}</td>
69 * </tr>
70 * <tr>
71 * <td>quote</td>
72 * <td>NoneMatcher</td>
73 * <td>{}</td>
74 * </tr>
75 * <tr>
76 * <td>ignore</td>
77 * <td>NoneMatcher</td>
78 * <td>{}</td>
79 * </tr>
80 * <tr>
81 * <td>emptyTokenAsNull</td>
82 * <td>boolean</td>
83 * <td>false</td>
84 * </tr>
85 * <tr>
86 * <td>ignoreEmptyTokens</td>
87 * <td>boolean</td>
88 * <td>true</td>
89 * </tr>
90 * </table>
91 *
92 * @since 1.3
93 */
94 public class StringTokenizer implements ListIterator<String>, Cloneable {
95
/**
 * Prototype tokenizer for comma separated values: comma delimiter, double quote as quote, nothing ignored,
 * trim matcher for trimming, empty tokens kept and returned as empty strings. Callers receive clones of it
 * through {@link #getCSVClone()}.
 */
private static final StringTokenizer CSV_TOKENIZER_PROTOTYPE;

static {
    final StringTokenizer prototype = new StringTokenizer();
    prototype.setDelimiterMatcher(StringMatcherFactory.INSTANCE.commaMatcher());
    prototype.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
    prototype.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
    prototype.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
    prototype.setEmptyTokenAsNull(false);
    prototype.setIgnoreEmptyTokens(false);
    CSV_TOKENIZER_PROTOTYPE = prototype;
}
106
/**
 * Prototype tokenizer for tab separated values: tab delimiter, double quote as quote, nothing ignored,
 * trim matcher for trimming, empty tokens kept and returned as empty strings. Callers receive clones of it
 * through {@link #getTSVClone()}.
 */
private static final StringTokenizer TSV_TOKENIZER_PROTOTYPE;

static {
    final StringTokenizer prototype = new StringTokenizer();
    prototype.setDelimiterMatcher(StringMatcherFactory.INSTANCE.tabMatcher());
    prototype.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
    prototype.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
    prototype.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
    prototype.setEmptyTokenAsNull(false);
    prototype.setIgnoreEmptyTokens(false);
    TSV_TOKENIZER_PROTOTYPE = prototype;
}
117
118 /**
119 * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
120 *
121 * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
122 */
123 private static StringTokenizer getCSVClone() {
124 return (StringTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
125 }
126
127 /**
128 * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
129 * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
130 * setTrimmer method).
131 * <p>
132 * You must call a "reset" method to set the string which you want to parse.
133 * </p>
134 *
135 * @return a new tokenizer instance which parses Comma Separated Value strings
136 */
137 public static StringTokenizer getCSVInstance() {
138 return getCSVClone();
139 }
140
141 /**
142 * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
143 * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
144 * setTrimmer method).
145 *
146 * @param input
147 * the text to parse
148 * @return a new tokenizer instance which parses Comma Separated Value strings
149 */
150 public static StringTokenizer getCSVInstance(final char[] input) {
151 return getCSVClone().reset(input);
152 }
153
154 /**
155 * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input.
156 * The default for CSV processing will be trim whitespace from both ends (which can be overridden with the
157 * setTrimmer method).
158 *
159 * @param input
160 * the text to parse
161 * @return a new tokenizer instance which parses Comma Separated Value strings
162 */
163 public static StringTokenizer getCSVInstance(final String input) {
164 return getCSVClone().reset(input);
165 }
166
167 /**
168 * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
169 *
170 * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
171 */
172 private static StringTokenizer getTSVClone() {
173 return (StringTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
174 }
175
176 /**
177 * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be
178 * trim whitespace from both ends (which can be overridden with the setTrimmer method).
179 * <p>
180 * You must call a "reset" method to set the string which you want to parse.
181 * </p>
182 *
183 * @return a new tokenizer instance which parses Tab Separated Value strings.
184 */
185 public static StringTokenizer getTSVInstance() {
186 return getTSVClone();
187 }
188
189 /**
190 * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be
191 * trim whitespace from both ends (which can be overridden with the setTrimmer method).
192 *
193 * @param input
194 * the string to parse
195 * @return a new tokenizer instance which parses Tab Separated Value strings.
196 */
197 public static StringTokenizer getTSVInstance(final char[] input) {
198 return getTSVClone().reset(input);
199 }
200
201 /**
202 * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be
203 * trim whitespace from both ends (which can be overridden with the setTrimmer method).
204 *
205 * @param input
206 * the string to parse
207 * @return a new tokenizer instance which parses Tab Separated Value strings.
208 */
209 public static StringTokenizer getTSVInstance(final String input) {
210 return getTSVClone().reset(input);
211 }
212
213 /** The text to work on. */
214 private char[] chars;
215
216 /** The parsed tokens. */
217 private String[] tokens;
218
219 /** The current iteration position. */
220 private int tokenPos;
221
222 /** The delimiter matcher. */
223 private StringMatcher delimMatcher = StringMatcherFactory.INSTANCE.splitMatcher();
224
225 /** The quote matcher. */
226 private StringMatcher quoteMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
227
228 /** The ignored matcher. */
229 private StringMatcher ignoredMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
230
231 /** The trimmer matcher. */
232 private StringMatcher trimmerMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
233
234 /** Whether to return empty tokens as null. */
235 private boolean emptyAsNull;
236
237 /** Whether to ignore empty tokens. */
238 private boolean ignoreEmptyTokens = true;
239
/**
 * Constructs a tokenizer with the default settings (splitting on space, tab, newline and form feed as per
 * StringTokenizer) and no text to tokenize.
 * <p>
 * This constructor is normally used with {@link #reset(String)}.
 * </p>
 */
public StringTokenizer() {
    chars = null;
}
250
/**
 * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
 *
 * @param input
 *            the characters to parse; a copy of the array is stored, {@code null} means no text
 */
public StringTokenizer(final char[] input) {
    if (input == null) {
        this.chars = null;
    } else {
        // keep a private copy so later changes to the caller's array are not seen
        this.chars = input.clone();
    }
}
260
/**
 * Constructs a tokenizer splitting on the specified character.
 *
 * @param input
 *            the characters to parse; a copy of the array is stored, {@code null} means no text
 * @param delim
 *            the field delimiter character
 */
public StringTokenizer(final char[] input, final char delim) {
    this(input);
    this.setDelimiterChar(delim);
}
273
/**
 * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the
 * specified quote character.
 *
 * @param input
 *            the characters to parse; a copy of the array is stored, {@code null} means no text
 * @param delim
 *            the field delimiter character
 * @param quote
 *            the field quoted string character
 */
public StringTokenizer(final char[] input, final char delim, final char quote) {
    this(input);
    setDelimiterChar(delim);
    setQuoteChar(quote);
}
289
/**
 * Constructs a tokenizer splitting on the specified string.
 *
 * @param input
 *            the characters to parse; a copy of the array is stored, {@code null} means no text
 * @param delim
 *            the field delimiter string
 */
public StringTokenizer(final char[] input, final String delim) {
    this(input);
    this.setDelimiterString(delim);
}
302
/**
 * Constructs a tokenizer splitting using the specified delimiter matcher.
 *
 * @param input
 *            the characters to parse; a copy of the array is stored, {@code null} means no text
 * @param delim
 *            the field delimiter matcher
 */
public StringTokenizer(final char[] input, final StringMatcher delim) {
    this(input);
    this.setDelimiterMatcher(delim);
}
315
/**
 * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the
 * specified quote matcher.
 *
 * @param input
 *            the characters to parse; a copy of the array is stored, {@code null} means no text
 * @param delim
 *            the field delimiter matcher
 * @param quote
 *            the field quoted string matcher
 */
public StringTokenizer(final char[] input, final StringMatcher delim, final StringMatcher quote) {
    this(input);
    setDelimiterMatcher(delim);
    setQuoteMatcher(quote);
}
331
/**
 * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
 *
 * @param input
 *            the string which is to be parsed, {@code null} means no text
 */
public StringTokenizer(final String input) {
    if (input == null) {
        this.chars = null;
    } else {
        this.chars = input.toCharArray();
    }
}
341
/**
 * Constructs a tokenizer splitting on the specified delimiter character.
 *
 * @param input
 *            the string which is to be parsed, {@code null} means no text
 * @param delim
 *            the field delimiter character
 */
public StringTokenizer(final String input, final char delim) {
    this(input);
    this.setDelimiterChar(delim);
}
354
/**
 * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the
 * specified quote character.
 *
 * @param input
 *            the string which is to be parsed, {@code null} means no text
 * @param delim
 *            the field delimiter character
 * @param quote
 *            the field quoted string character
 */
public StringTokenizer(final String input, final char delim, final char quote) {
    this(input);
    setDelimiterChar(delim);
    setQuoteChar(quote);
}
370
/**
 * Constructs a tokenizer splitting on the specified delimiter string.
 *
 * @param input
 *            the string which is to be parsed, {@code null} means no text
 * @param delim
 *            the field delimiter string
 */
public StringTokenizer(final String input, final String delim) {
    this(input);
    this.setDelimiterString(delim);
}
383
/**
 * Constructs a tokenizer splitting using the specified delimiter matcher.
 *
 * @param input
 *            the string which is to be parsed, {@code null} means no text
 * @param delim
 *            the field delimiter matcher
 */
public StringTokenizer(final String input, final StringMatcher delim) {
    this(input);
    this.setDelimiterMatcher(delim);
}
396
/**
 * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the
 * specified quote matcher.
 *
 * @param input
 *            the string which is to be parsed, {@code null} means no text
 * @param delim
 *            the field delimiter matcher
 * @param quote
 *            the field quoted string matcher
 */
public StringTokenizer(final String input, final StringMatcher delim, final StringMatcher quote) {
    this(input);
    setDelimiterMatcher(delim);
    setQuoteMatcher(quote);
}
412
413 /**
414 * Unsupported ListIterator operation.
415 *
416 * @param obj
417 * this parameter ignored.
418 * @throws UnsupportedOperationException
419 * always
420 */
421 @Override
422 public void add(final String obj) {
423 throw new UnsupportedOperationException("add() is unsupported");
424 }
425
426 /**
427 * Adds a token to a list, paying attention to the parameters we've set.
428 *
429 * @param list
430 * the list to add to
431 * @param tok
432 * the token to add
433 */
434 private void addToken(final List<String> list, String tok) {
435 if (tok == null || tok.isEmpty()) {
436 if (isIgnoreEmptyTokens()) {
437 return;
438 }
439 if (isEmptyTokenAsNull()) {
440 tok = null;
441 }
442 }
443 list.add(tok);
444 }
445
446 /**
447 * Checks if tokenization has been done, and if not then do it.
448 */
449 private void checkTokenized() {
450 if (tokens == null) {
451 final List<String> split;
452 if (chars == null) {
453 // still call tokenize as subclass may do some work
454 split = tokenize(null, 0, 0);
455 } else {
456 split = tokenize(chars, 0, chars.length);
457 }
458 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
459 }
460 }
461
462 /**
463 * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
464 * list. If a {@link CloneNotSupportedException} is caught, return {@code null}.
465 *
466 * @return a new instance of this Tokenizer which has been reset.
467 */
468 @Override
469 public Object clone() {
470 try {
471 return cloneReset();
472 } catch (final CloneNotSupportedException ex) {
473 return null;
474 }
475 }
476
477 /**
478 * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
479 * list.
480 *
481 * @return a new instance of this Tokenizer which has been reset.
482 * @throws CloneNotSupportedException
483 * if there is a problem cloning
484 */
485 Object cloneReset() throws CloneNotSupportedException {
486 // this method exists to enable 100% test coverage
487 final StringTokenizer cloned = (StringTokenizer) super.clone();
488 if (cloned.chars != null) {
489 cloned.chars = cloned.chars.clone();
490 }
491 cloned.reset();
492 return cloned;
493 }
494
495 /**
496 * Gets the String content that the tokenizer is parsing.
497 *
498 * @return The string content being parsed
499 */
500 public String getContent() {
501 if (chars == null) {
502 return null;
503 }
504 return new String(chars);
505 }
506
507 /**
508 * Gets the field delimiter matcher.
509 *
510 * @return The delimiter matcher in use
511 */
512 public StringMatcher getDelimiterMatcher() {
513 return this.delimMatcher;
514 }
515
516 /**
517 * Gets the ignored character matcher.
518 * <p>
519 * These characters are ignored when parsing the String, unless they are within a quoted region. The default value
520 * is not to ignore anything.
521 * </p>
522 *
523 * @return The ignored matcher in use
524 */
525 public StringMatcher getIgnoredMatcher() {
526 return ignoredMatcher;
527 }
528
529 /**
530 * Gets the quote matcher currently in use.
531 * <p>
532 * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The
533 * default value is '"' (double quote).
534 * </p>
535 *
536 * @return The quote matcher in use
537 */
538 public StringMatcher getQuoteMatcher() {
539 return quoteMatcher;
540 }
541
542 /**
543 * Gets a copy of the full token list as an independent modifiable array.
544 *
545 * @return The tokens as a String array
546 */
547 public String[] getTokenArray() {
548 checkTokenized();
549 return tokens.clone();
550 }
551
552 /**
553 * Gets a copy of the full token list as an independent modifiable list.
554 *
555 * @return The tokens as a String list
556 */
557 public List<String> getTokenList() {
558 checkTokenized();
559 return new ArrayList<>(Arrays.asList(tokens));
560 }
561
562 /**
563 * Gets the trimmer character matcher.
564 * <p>
565 * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default
566 * value is not to trim anything.
567 * </p>
568 *
569 * @return The trimmer matcher in use
570 */
571 public StringMatcher getTrimmerMatcher() {
572 return trimmerMatcher;
573 }
574
575 /**
576 * Tests whether there are any more tokens.
577 *
578 * @return true if there are more tokens
579 */
580 @Override
581 public boolean hasNext() {
582 checkTokenized();
583 return tokenPos < tokens.length;
584 }
585
586 /**
587 * Tests whether there are any previous tokens that can be iterated to.
588 *
589 * @return true if there are previous tokens
590 */
591 @Override
592 public boolean hasPrevious() {
593 checkTokenized();
594 return tokenPos > 0;
595 }
596
597 /**
598 * Tests whether the tokenizer currently returns empty tokens as null. The default for this property is false.
599 *
600 * @return true if empty tokens are returned as null
601 */
602 public boolean isEmptyTokenAsNull() {
603 return this.emptyAsNull;
604 }
605
606 /**
607 * Tests whether the tokenizer currently ignores empty tokens. The default for this property is true.
608 *
609 * @return true if empty tokens are not returned
610 */
611 public boolean isIgnoreEmptyTokens() {
612 return ignoreEmptyTokens;
613 }
614
615 /**
616 * Tests if the characters at the index specified match the quote already matched in readNextToken().
617 *
618 * @param srcChars
619 * the character array being tokenized
620 * @param pos
621 * the position to check for a quote
622 * @param len
623 * the length of the character array being tokenized
624 * @param quoteStart
625 * the start position of the matched quote, 0 if no quoting
626 * @param quoteLen
627 * the length of the matched quote, 0 if no quoting
628 * @return true if a quote is matched
629 */
630 private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart,
631 final int quoteLen) {
632 for (int i = 0; i < quoteLen; i++) {
633 if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
634 return false;
635 }
636 }
637 return true;
638 }
639
640 /**
641 * Gets the next token.
642 *
643 * @return The next String token
644 * @throws NoSuchElementException
645 * if there are no more elements
646 */
647 @Override
648 public String next() {
649 if (hasNext()) {
650 return tokens[tokenPos++];
651 }
652 throw new NoSuchElementException();
653 }
654
655 /**
656 * Gets the index of the next token to return.
657 *
658 * @return The next token index
659 */
660 @Override
661 public int nextIndex() {
662 return tokenPos;
663 }
664
665 /**
666 * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing
667 * {@link NoSuchElementException} when no tokens remain.
668 *
669 * @return The next sequential token, or null when no more tokens are found
670 */
671 public String nextToken() {
672 if (hasNext()) {
673 return tokens[tokenPos++];
674 }
675 return null;
676 }
677
678 /**
679 * Gets the token previous to the last returned token.
680 *
681 * @return The previous token
682 */
683 @Override
684 public String previous() {
685 if (hasPrevious()) {
686 return tokens[--tokenPos];
687 }
688 throw new NoSuchElementException();
689 }
690
691 /**
692 * Gets the index of the previous token.
693 *
694 * @return The previous token index
695 */
696 @Override
697 public int previousIndex() {
698 return tokenPos - 1;
699 }
700
701 /**
702 * Gets the previous token from the String.
703 *
704 * @return The previous sequential token, or null when no more tokens are found
705 */
706 public String previousToken() {
707 if (hasPrevious()) {
708 return tokens[--tokenPos];
709 }
710 return null;
711 }
712
713 /**
714 * Reads character by character through the String to get the next token.
715 *
716 * @param srcChars
717 * the character array being tokenized
718 * @param start
719 * the first character of field
720 * @param len
721 * the length of the character array being tokenized
722 * @param workArea
723 * a temporary work area
724 * @param tokenList
725 * the list of parsed tokens
726 * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of
727 * string found
728 */
729 private int readNextToken(final char[] srcChars, int start, final int len, final TextStringBuilder workArea,
730 final List<String> tokenList) {
731 // skip all leading whitespace, unless it is the
732 // field delimiter or the quote character
733 while (start < len) {
734 final int removeLen = Math.max(getIgnoredMatcher().isMatch(srcChars, start, start, len),
735 getTrimmerMatcher().isMatch(srcChars, start, start, len));
736 if (removeLen == 0 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
737 || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
738 break;
739 }
740 start += removeLen;
741 }
742
743 // handle reaching end
744 if (start >= len) {
745 addToken(tokenList, StringUtils.EMPTY);
746 return -1;
747 }
748
749 // handle empty token
750 final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
751 if (delimLen > 0) {
752 addToken(tokenList, StringUtils.EMPTY);
753 return start + delimLen;
754 }
755
756 // handle found token
757 final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
758 if (quoteLen > 0) {
759 return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
760 }
761 return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
762 }
763
764 /**
765 * Reads a possibly quoted string token.
766 *
767 * @param srcChars
768 * the character array being tokenized
769 * @param start
770 * the first character of field
771 * @param len
772 * the length of the character array being tokenized
773 * @param workArea
774 * a temporary work area
775 * @param tokenList
776 * the list of parsed tokens
777 * @param quoteStart
778 * the start position of the matched quote, 0 if no quoting
779 * @param quoteLen
780 * the length of the matched quote, 0 if no quoting
781 * @return The starting position of the next field (the character immediately after the delimiter, or if end of
782 * string found, then the length of string
783 */
784 private int readWithQuotes(final char[] srcChars, final int start, final int len, final TextStringBuilder workArea,
785 final List<String> tokenList, final int quoteStart, final int quoteLen) {
786 // Loop until we've found the end of the quoted
787 // string or the end of the input
788 workArea.clear();
789 int pos = start;
790 boolean quoting = quoteLen > 0;
791 int trimStart = 0;
792
793 while (pos < len) {
794 // quoting mode can occur several times throughout a string
795 // we must switch between quoting and non-quoting until we
796 // encounter a non-quoted delimiter, or end of string
797 if (quoting) {
798 // In quoting mode
799
800 // If we've found a quote character, see if it's
801 // followed by a second quote. If so, then we need
802 // to actually put the quote character into the token
803 // rather than end the token.
804 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
805 if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
806 // matched pair of quotes, thus an escaped quote
807 workArea.append(srcChars, pos, quoteLen);
808 pos += quoteLen * 2;
809 trimStart = workArea.size();
810 continue;
811 }
812
813 // end of quoting
814 quoting = false;
815 pos += quoteLen;
816 continue;
817 }
818
819 } else {
820 // Not in quoting mode
821
822 // check for delimiter, and thus end of token
823 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
824 if (delimLen > 0) {
825 // return condition when end of token found
826 addToken(tokenList, workArea.substring(0, trimStart));
827 return pos + delimLen;
828 }
829
830 // check for quote, and thus back into quoting mode
831 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
832 quoting = true;
833 pos += quoteLen;
834 continue;
835 }
836
837 // check for ignored (outside quotes), and ignore
838 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
839 if (ignoredLen > 0) {
840 pos += ignoredLen;
841 continue;
842 }
843
844 // check for trimmed character
845 // don't yet know if its at the end, so copy to workArea
846 // use trimStart to keep track of trim at the end
847 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
848 if (trimmedLen > 0) {
849 workArea.append(srcChars, pos, trimmedLen);
850 pos += trimmedLen;
851 continue;
852 }
853 }
854 // copy regular character from inside quotes
855 workArea.append(srcChars[pos++]);
856 trimStart = workArea.size();
857 }
858
859 // return condition when end of string found
860 addToken(tokenList, workArea.substring(0, trimStart));
861 return -1;
862 }
863
864 /**
865 * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
866 *
867 * @throws UnsupportedOperationException
868 * always
869 */
870 @Override
871 public void remove() {
872 throw new UnsupportedOperationException("remove() is unsupported");
873 }
874
875 /**
876 * Resets this tokenizer, forgetting all parsing and iteration already completed.
877 * <p>
878 * This method allows the same tokenizer to be reused for the same String.
879 * </p>
880 *
881 * @return this, to enable chaining
882 */
883 public StringTokenizer reset() {
884 tokenPos = 0;
885 tokens = null;
886 return this;
887 }
888
889 /**
890 * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
891 * same settings on multiple input lines.
892 *
893 * @param input
894 * the new character array to tokenize, not cloned, null sets no text to parse
895 * @return this, to enable chaining
896 */
897 public StringTokenizer reset(final char[] input) {
898 reset();
899 this.chars = input != null ? input.clone() : null;
900 return this;
901 }
902
903 /**
904 * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
905 * same settings on multiple input lines.
906 *
907 * @param input
908 * the new string to tokenize, null sets no text to parse
909 * @return this, to enable chaining
910 */
911 public StringTokenizer reset(final String input) {
912 reset();
913 this.chars = input != null ? input.toCharArray() : null;
914 return this;
915 }
916
917 /**
918 * Throws {@link UnsupportedOperationException} for this unsupported ListIterator operation.
919 *
920 * @param obj
921 * this parameter ignored.
922 * @throws UnsupportedOperationException
923 * always
924 */
925 @Override
926 public void set(final String obj) {
927 throw new UnsupportedOperationException("set() is unsupported");
928 }
929
930 /**
931 * Sets the field delimiter character.
932 *
933 * @param delim
934 * the delimiter character to use
935 * @return this, to enable chaining
936 */
937 public StringTokenizer setDelimiterChar(final char delim) {
938 return setDelimiterMatcher(StringMatcherFactory.INSTANCE.charMatcher(delim));
939 }
940
941 /**
942 * Sets the field delimiter matcher.
943 * <p>
944 * The delimiter is used to separate one token from another.
945 * </p>
946 *
947 * @param delim
948 * the delimiter matcher to use
949 * @return this, to enable chaining
950 */
951 public StringTokenizer setDelimiterMatcher(final StringMatcher delim) {
952 this.delimMatcher = delim == null ? StringMatcherFactory.INSTANCE.noneMatcher() : delim;
953 return this;
954 }
955
956 /**
957 * Sets the field delimiter string.
958 *
959 * @param delim
960 * the delimiter string to use
961 * @return this, to enable chaining
962 */
963 public StringTokenizer setDelimiterString(final String delim) {
964 return setDelimiterMatcher(StringMatcherFactory.INSTANCE.stringMatcher(delim));
965 }
966
967 /**
968 * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
969 *
970 * @param emptyAsNull
971 * whether empty tokens are returned as null
972 * @return this, to enable chaining
973 */
974 public StringTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
975 this.emptyAsNull = emptyAsNull;
976 return this;
977 }
978
979 /**
980 * Sets the character to ignore.
981 * <p>
982 * This character is ignored when parsing the String, unless it is within a quoted region.
983 * </p>
984 *
985 * @param ignored
986 * the ignored character to use
987 * @return this, to enable chaining
988 */
989 public StringTokenizer setIgnoredChar(final char ignored) {
990 return setIgnoredMatcher(StringMatcherFactory.INSTANCE.charMatcher(ignored));
991 }
992
993 /**
994 * Sets the matcher for characters to ignore.
995 * <p>
996 * These characters are ignored when parsing the String, unless they are within a quoted region.
997 * </p>
998 *
999 * @param ignored
1000 * the ignored matcher to use, null ignored
1001 * @return this, to enable chaining
1002 */
1003 public StringTokenizer setIgnoredMatcher(final StringMatcher ignored) {
1004 if (ignored != null) {
1005 this.ignoredMatcher = ignored;
1006 }
1007 return this;
1008 }
1009
1010 /**
1011 * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
1012 *
1013 * @param ignoreEmptyTokens
1014 * whether empty tokens are not returned
1015 * @return this, to enable chaining
1016 */
1017 public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
1018 this.ignoreEmptyTokens = ignoreEmptyTokens;
1019 return this;
1020 }
1021
1022 /**
1023 * Sets the quote character to use.
1024 * <p>
1025 * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
1026 * </p>
1027 *
1028 * @param quote
1029 * the quote character to use
1030 * @return this, to enable chaining
1031 */
1032 public StringTokenizer setQuoteChar(final char quote) {
1033 return setQuoteMatcher(StringMatcherFactory.INSTANCE.charMatcher(quote));
1034 }
1035
1036 /**
1037 * Sets the quote matcher to use.
1038 * <p>
1039 * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
1040 * </p>
1041 *
1042 * @param quote
1043 * the quote matcher to use, null ignored
1044 * @return this, to enable chaining
1045 */
1046 public StringTokenizer setQuoteMatcher(final StringMatcher quote) {
1047 if (quote != null) {
1048 this.quoteMatcher = quote;
1049 }
1050 return this;
1051 }
1052
1053 /**
1054 * Sets the matcher for characters to trim.
1055 * <p>
1056 * These characters are trimmed off on each side of the delimiter until the token or quote is found.
1057 *
1058 * @param trimmer
1059 * the trimmer matcher to use, null ignored
1060 * @return this, to enable chaining
1061 */
1062 public StringTokenizer setTrimmerMatcher(final StringMatcher trimmer) {
1063 if (trimmer != null) {
1064 this.trimmerMatcher = trimmer;
1065 }
1066 return this;
1067 }
1068
1069 /**
1070 * Gets the number of tokens found in the String.
1071 *
1072 * @return The number of matched tokens
1073 */
1074 public int size() {
1075 checkTokenized();
1076 return tokens.length;
1077 }
1078
1079 /**
1080 * Internal method to performs the tokenization.
1081 * <p>
1082 * Most users of this class do not need to call this method. This method will be called automatically by other
1083 * (public) methods when required.
1084 * </p>
1085 * <p>
1086 * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass
1087 * could alter the character array, offset or count to be parsed, or call the tokenizer multiple times on multiple
1088 * strings. It is also be possible to filter the results.
1089 * </p>
1090 * <p>
1091 * {@code StrTokenizer} will always pass a zero offset and a count equal to the length of the array to this
1092 * method, however a subclass may pass other values, or even an entirely different array.
1093 * </p>
1094 *
1095 * @param srcChars
1096 * the character array being tokenized, may be null
1097 * @param offset
1098 * the start position within the character array, must be valid
1099 * @param count
1100 * the number of characters to tokenize, must be valid
1101 * @return The modifiable list of String tokens, unmodifiable if null array or zero count
1102 */
1103 protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1104 if (srcChars == null || count == 0) {
1105 return Collections.emptyList();
1106 }
1107 final TextStringBuilder buf = new TextStringBuilder();
1108 final List<String> tokenList = new ArrayList<>();
1109 int pos = offset;
1110
1111 // loop around the entire buffer
1112 while (pos >= 0 && pos < count) {
1113 // find next token
1114 pos = readNextToken(srcChars, pos, count, buf, tokenList);
1115
1116 // handle case where end of string is a delimiter
1117 if (pos >= count) {
1118 addToken(tokenList, StringUtils.EMPTY);
1119 }
1120 }
1121 return tokenList;
1122 }
1123
1124 /**
1125 * Gets the String content that the tokenizer is parsing.
1126 *
1127 * @return The string content being parsed
1128 */
1129 @Override
1130 public String toString() {
1131 if (tokens == null) {
1132 return "StringTokenizer[not tokenized yet]";
1133 }
1134 return "StringTokenizer" + getTokenList();
1135 }
1136
1137 }