1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.text;
18
19 import java.util.ArrayList;
20 import java.util.Collections;
21 import java.util.List;
22 import java.util.ListIterator;
23 import java.util.NoSuchElementException;
24
25 import org.apache.commons.lang3.ArrayUtils;
26 import org.apache.commons.lang3.StringUtils;
27
28 /**
29 * Tokenizes a string based on delimiters (separators)
30 * and supporting quoting and ignored character concepts.
31 * <p>
32 * This class can split a String into many smaller strings. It aims
33 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
34 * however it offers much more control and flexibility including implementing
35 * the {@code ListIterator} interface. By default, it is set up
36 * like {@code StringTokenizer}.
37 * <p>
38 * The input String is split into a number of <em>tokens</em>.
39 * Each token is separated from the next String by a <em>delimiter</em>.
40 * One or more delimiter characters must be specified.
41 * <p>
42 * Each token may be surrounded by quotes.
43 * The <em>quote</em> matcher specifies the quote character(s).
44 * A quote may be escaped within a quoted section by duplicating itself.
45 * <p>
46 * Between each token and the delimiter are potentially characters that need trimming.
47 * The <em>trimmer</em> matcher specifies these characters.
48 * One usage might be to trim whitespace characters.
49 * <p>
50 * At any point outside the quotes there might potentially be invalid characters.
51 * The <em>ignored</em> matcher specifies these characters to be removed.
52 * One usage might be to remove new line characters.
53 * <p>
54 * Empty tokens may be removed or returned as null.
55 * <pre>
56 * "a,b,c" - Three tokens "a","b","c" (comma delimiter)
57 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace)
58 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
59 * </pre>
60 *
61 * <table>
62 * <caption>StrTokenizer properties and options</caption>
63 * <tr>
64 * <th>Property</th><th>Type</th><th>Default</th>
65 * </tr>
66 * <tr>
67 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
68 * </tr>
69 * <tr>
70 * <td>quote</td><td>NoneMatcher</td><td>{}</td>
71 * </tr>
72 * <tr>
73 * <td>ignore</td><td>NoneMatcher</td><td>{}</td>
74 * </tr>
75 * <tr>
76 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
77 * </tr>
78 * <tr>
79 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
80 * </tr>
81 * </table>
82 *
83 * @since 1.0
84 * @deprecated Deprecated as of 1.3, use {@link StringTokenizer} instead. This class will be removed in 2.0.
85 */
86 @Deprecated
87 public class StrTokenizer implements ListIterator<String>, Cloneable {
88
89 /** Comma separated values tokenizer internal variable. */
90 // @formatter:off
91 private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
92 .setDelimiterMatcher(StrMatcher.commaMatcher())
93 .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
94 .setIgnoredMatcher(StrMatcher.noneMatcher())
95 .setTrimmerMatcher(StrMatcher.trimMatcher())
96 .setEmptyTokenAsNull(false)
97 .setIgnoreEmptyTokens(false);
98 // @formatter:on
99
100 /** Tab separated values tokenizer internal variable. */
101 // @formatter:off
102 private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
103 .setDelimiterMatcher(StrMatcher.tabMatcher())
104 .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
105 .setIgnoredMatcher(StrMatcher.noneMatcher())
106 .setTrimmerMatcher(StrMatcher.trimMatcher())
107 .setEmptyTokenAsNull(false)
108 .setIgnoreEmptyTokens(false);
109 // @formatter:on
110
111 /**
112 * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
113 *
114 * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
115 */
116 private static StrTokenizer getCSVClone() {
117 return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
118 }
119
120 /**
121 * Gets a new tokenizer instance which parses Comma Separated Value strings
122 * initializing it with the given input. The default for CSV processing
123 * will be trim whitespace from both ends (which can be overridden with
124 * the setTrimmer method).
125 * <p>
126 * You must call a "reset" method to set the string which you want to parse.
127 * </p>
128 * @return a new tokenizer instance which parses Comma Separated Value strings
129 */
130 public static StrTokenizer getCSVInstance() {
131 return getCSVClone();
132 }
133
134 /**
135 * Gets a new tokenizer instance which parses Comma Separated Value strings
136 * initializing it with the given input. The default for CSV processing
137 * will be trim whitespace from both ends (which can be overridden with
138 * the setTrimmer method).
139 *
140 * @param input the text to parse
141 * @return a new tokenizer instance which parses Comma Separated Value strings
142 */
143 public static StrTokenizer getCSVInstance(final char[] input) {
144 final StrTokenizer tok = getCSVClone();
145 tok.reset(input);
146 return tok;
147 }
148
149 /**
150 * Gets a new tokenizer instance which parses Comma Separated Value strings
151 * initializing it with the given input. The default for CSV processing
152 * will be trim whitespace from both ends (which can be overridden with
153 * the setTrimmer method).
154 *
155 * @param input the text to parse
156 * @return a new tokenizer instance which parses Comma Separated Value strings
157 */
158 public static StrTokenizer getCSVInstance(final String input) {
159 final StrTokenizer tok = getCSVClone();
160 tok.reset(input);
161 return tok;
162 }
163 /**
164 * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
165 *
166 * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
167 */
168 private static StrTokenizer getTSVClone() {
169 return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
170 }
171
172 /**
173 * Gets a new tokenizer instance which parses Tab Separated Value strings.
174 * The default for CSV processing will be trim whitespace from both ends
175 * (which can be overridden with the setTrimmer method).
176 * <p>
177 * You must call a "reset" method to set the string which you want to parse.
178 * </p>
179 * @return a new tokenizer instance which parses Tab Separated Value strings.
180 */
181 public static StrTokenizer getTSVInstance() {
182 return getTSVClone();
183 }
184
185 /**
186 * Gets a new tokenizer instance which parses Tab Separated Value strings.
187 * The default for CSV processing will be trim whitespace from both ends
188 * (which can be overridden with the setTrimmer method).
189 * @param input the string to parse
190 * @return a new tokenizer instance which parses Tab Separated Value strings.
191 */
192 public static StrTokenizer getTSVInstance(final char[] input) {
193 final StrTokenizer tok = getTSVClone();
194 tok.reset(input);
195 return tok;
196 }
197
198 /**
199 * Gets a new tokenizer instance which parses Tab Separated Value strings.
200 * The default for CSV processing will be trim whitespace from both ends
201 * (which can be overridden with the setTrimmer method).
202 * @param input the string to parse
203 * @return a new tokenizer instance which parses Tab Separated Value strings.
204 */
205 public static StrTokenizer getTSVInstance(final String input) {
206 final StrTokenizer tok = getTSVClone();
207 tok.reset(input);
208 return tok;
209 }
210
211 /** The text to work on. */
212 private char[] chars;
213
214 /** The parsed tokens. */
215 private String[] tokens;
216
217 /** The current iteration position. */
218 private int tokenPos;
219
220 /** The delimiter matcher. */
221 private StrMatcher delimMatcher = StrMatcher.splitMatcher();
222
223 /** The quote matcher. */
224 private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
225
226 /** The ignored matcher. */
227 private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
228
229 /** The trimmer matcher. */
230 private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
231
232 /** Whether to return empty tokens as null. */
233 private boolean emptyAsNull;
234
235 /** Whether to ignore empty tokens. */
236 private boolean ignoreEmptyTokens = true;
237
238 /**
239 * Constructs a tokenizer splitting on space, tab, newline and form feed
240 * as per StringTokenizer, but with no text to tokenize.
241 * <p>
242 * This constructor is normally used with {@link #reset(String)}.
243 * </p>
244 */
245 public StrTokenizer() {
246 this.chars = null;
247 }
248
249 /**
250 * Constructs a tokenizer splitting on space, tab, newline and form feed
251 * as per StringTokenizer.
252 *
253 * @param input the string which is to be parsed, not cloned
254 */
255 public StrTokenizer(final char[] input) {
256 if (input == null) {
257 this.chars = null;
258 } else {
259 this.chars = input.clone();
260 }
261 }
262
263 /**
264 * Constructs a tokenizer splitting on the specified character.
265 *
266 * @param input the string which is to be parsed, not cloned
267 * @param delim the field delimiter character
268 */
269 public StrTokenizer(final char[] input, final char delim) {
270 this(input);
271 setDelimiterChar(delim);
272 }
273
274 /**
275 * Constructs a tokenizer splitting on the specified delimiter character
276 * and handling quotes using the specified quote character.
277 *
278 * @param input the string which is to be parsed, not cloned
279 * @param delim the field delimiter character
280 * @param quote the field quoted string character
281 */
282 public StrTokenizer(final char[] input, final char delim, final char quote) {
283 this(input, delim);
284 setQuoteChar(quote);
285 }
286
287 /**
288 * Constructs a tokenizer splitting on the specified string.
289 *
290 * @param input the string which is to be parsed, not cloned
291 * @param delim the field delimiter string
292 */
293 public StrTokenizer(final char[] input, final String delim) {
294 this(input);
295 setDelimiterString(delim);
296 }
297
298 /**
299 * Constructs a tokenizer splitting using the specified delimiter matcher.
300 *
301 * @param input the string which is to be parsed, not cloned
302 * @param delim the field delimiter matcher
303 */
304 public StrTokenizer(final char[] input, final StrMatcher delim) {
305 this(input);
306 setDelimiterMatcher(delim);
307 }
308
309 /**
310 * Constructs a tokenizer splitting using the specified delimiter matcher
311 * and handling quotes using the specified quote matcher.
312 *
313 * @param input the string which is to be parsed, not cloned
314 * @param delim the field delimiter character
315 * @param quote the field quoted string character
316 */
317 public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
318 this(input, delim);
319 setQuoteMatcher(quote);
320 }
321
322 /**
323 * Constructs a tokenizer splitting on space, tab, newline and form feed
324 * as per StringTokenizer.
325 *
326 * @param input the string which is to be parsed
327 */
328 public StrTokenizer(final String input) {
329 if (input != null) {
330 chars = input.toCharArray();
331 } else {
332 chars = null;
333 }
334 }
335
336 /**
337 * Constructs a tokenizer splitting on the specified delimiter character.
338 *
339 * @param input the string which is to be parsed
340 * @param delim the field delimiter character
341 */
342 public StrTokenizer(final String input, final char delim) {
343 this(input);
344 setDelimiterChar(delim);
345 }
346
347 /**
348 * Constructs a tokenizer splitting on the specified delimiter character
349 * and handling quotes using the specified quote character.
350 *
351 * @param input the string which is to be parsed
352 * @param delim the field delimiter character
353 * @param quote the field quoted string character
354 */
355 public StrTokenizer(final String input, final char delim, final char quote) {
356 this(input, delim);
357 setQuoteChar(quote);
358 }
359
360 /**
361 * Constructs a tokenizer splitting on the specified delimiter string.
362 *
363 * @param input the string which is to be parsed
364 * @param delim the field delimiter string
365 */
366 public StrTokenizer(final String input, final String delim) {
367 this(input);
368 setDelimiterString(delim);
369 }
370
371 /**
372 * Constructs a tokenizer splitting using the specified delimiter matcher.
373 *
374 * @param input the string which is to be parsed
375 * @param delim the field delimiter matcher
376 */
377 public StrTokenizer(final String input, final StrMatcher delim) {
378 this(input);
379 setDelimiterMatcher(delim);
380 }
381
382 /**
383 * Constructs a tokenizer splitting using the specified delimiter matcher
384 * and handling quotes using the specified quote matcher.
385 *
386 * @param input the string which is to be parsed
387 * @param delim the field delimiter matcher
388 * @param quote the field quoted string matcher
389 */
390 public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
391 this(input, delim);
392 setQuoteMatcher(quote);
393 }
394
395 /**
396 * Unsupported ListIterator operation.
397 * @param obj this parameter ignored.
398 * @throws UnsupportedOperationException always
399 */
400 @Override
401 public void add(final String obj) {
402 throw new UnsupportedOperationException("add() is unsupported");
403 }
404
405 /**
406 * Adds a token to a list, paying attention to the parameters we've set.
407 *
408 * @param list the list to add to
409 * @param tok the token to add
410 */
411 private void addToken(final List<String> list, String tok) {
412 if (tok == null || tok.isEmpty()) {
413 if (isIgnoreEmptyTokens()) {
414 return;
415 }
416 if (isEmptyTokenAsNull()) {
417 tok = null;
418 }
419 }
420 list.add(tok);
421 }
422
423 /**
424 * Checks if tokenization has been done, and if not then do it.
425 */
426 private void checkTokenized() {
427 if (tokens == null) {
428 if (chars == null) {
429 // still call tokenize as subclass may do some work
430 final List<String> split = tokenize(null, 0, 0);
431 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
432 } else {
433 final List<String> split = tokenize(chars, 0, chars.length);
434 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
435 }
436 }
437 }
438
439 /**
440 * Creates a new instance of this Tokenizer. The new instance is reset so
441 * that it will be at the start of the token list.
442 * If a {@link CloneNotSupportedException} is caught, return {@code null}.
443 *
444 * @return a new instance of this Tokenizer which has been reset.
445 */
446 @Override
447 public Object clone() {
448 try {
449 return cloneReset();
450 } catch (final CloneNotSupportedException ex) {
451 return null;
452 }
453 }
454
455 /**
456 * Creates a new instance of this Tokenizer. The new instance is reset so that
457 * it will be at the start of the token list.
458 *
459 * @return a new instance of this Tokenizer which has been reset.
460 * @throws CloneNotSupportedException if there is a problem cloning
461 */
462 Object cloneReset() throws CloneNotSupportedException {
463 // this method exists to enable 100% test coverage
464 final StrTokenizer cloned = (StrTokenizer) super.clone();
465 if (cloned.chars != null) {
466 cloned.chars = cloned.chars.clone();
467 }
468 cloned.reset();
469 return cloned;
470 }
471
472 /**
473 * Gets the String content that the tokenizer is parsing.
474 *
475 * @return The string content being parsed
476 */
477 public String getContent() {
478 if (chars == null) {
479 return null;
480 }
481 return new String(chars);
482 }
483
484 /**
485 * Gets the field delimiter matcher.
486 *
487 * @return The delimiter matcher in use
488 */
489 public StrMatcher getDelimiterMatcher() {
490 return this.delimMatcher;
491 }
492
493 /**
494 * Gets the ignored character matcher.
495 * <p>
496 * These characters are ignored when parsing the String, unless they are
497 * within a quoted region.
498 * The default value is not to ignore anything.
499 * </p>
500 *
501 * @return The ignored matcher in use
502 */
503 public StrMatcher getIgnoredMatcher() {
504 return ignoredMatcher;
505 }
506
507 /**
508 * Gets the quote matcher currently in use.
509 * <p>
510 * The quote character is used to wrap data between the tokens.
511 * This enables delimiters to be entered as data.
512 * The default value is '"' (double quote).
513 * </p>
514 *
515 * @return The quote matcher in use
516 */
517 public StrMatcher getQuoteMatcher() {
518 return quoteMatcher;
519 }
520
521 /**
522 * Gets a copy of the full token list as an independent modifiable array.
523 *
524 * @return The tokens as a String array
525 */
526 public String[] getTokenArray() {
527 checkTokenized();
528 return tokens.clone();
529 }
530
531 /**
532 * Gets a copy of the full token list as an independent modifiable list.
533 *
534 * @return The tokens as a String array
535 */
536 public List<String> getTokenList() {
537 checkTokenized();
538 final List<String> list = new ArrayList<>(tokens.length);
539 Collections.addAll(list, tokens);
540
541 return list;
542 }
543
544 /**
545 * Gets the trimmer character matcher.
546 * <p>
547 * These characters are trimmed off on each side of the delimiter
548 * until the token or quote is found.
549 * The default value is not to trim anything.
550 * </p>
551 *
552 * @return The trimmer matcher in use
553 */
554 public StrMatcher getTrimmerMatcher() {
555 return trimmerMatcher;
556 }
557
558 /**
559 * Checks whether there are any more tokens.
560 *
561 * @return true if there are more tokens
562 */
563 @Override
564 public boolean hasNext() {
565 checkTokenized();
566 return tokenPos < tokens.length;
567 }
568
569 /**
570 * Checks whether there are any previous tokens that can be iterated to.
571 *
572 * @return true if there are previous tokens
573 */
574 @Override
575 public boolean hasPrevious() {
576 checkTokenized();
577 return tokenPos > 0;
578 }
579
580 /**
581 * Gets whether the tokenizer currently returns empty tokens as null.
582 * The default for this property is false.
583 *
584 * @return true if empty tokens are returned as null
585 */
586 public boolean isEmptyTokenAsNull() {
587 return this.emptyAsNull;
588 }
589
590 /**
591 * Gets whether the tokenizer currently ignores empty tokens.
592 * The default for this property is true.
593 *
594 * @return true if empty tokens are not returned
595 */
596 public boolean isIgnoreEmptyTokens() {
597 return ignoreEmptyTokens;
598 }
599
600 /**
601 * Checks if the characters at the index specified match the quote
602 * already matched in readNextToken().
603 *
604 * @param srcChars the character array being tokenized
605 * @param pos the position to check for a quote
606 * @param len the length of the character array being tokenized
607 * @param quoteStart the start position of the matched quote, 0 if no quoting
608 * @param quoteLen the length of the matched quote, 0 if no quoting
609 * @return true if a quote is matched
610 */
611 private boolean isQuote(final char[] srcChars,
612 final int pos,
613 final int len,
614 final int quoteStart,
615 final int quoteLen) {
616 for (int i = 0; i < quoteLen; i++) {
617 if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
618 return false;
619 }
620 }
621 return true;
622 }
623
624 /**
625 * Gets the next token.
626 *
627 * @return The next String token
628 * @throws NoSuchElementException if there are no more elements
629 */
630 @Override
631 public String next() {
632 if (hasNext()) {
633 return tokens[tokenPos++];
634 }
635 throw new NoSuchElementException();
636 }
637
638 /**
639 * Gets the index of the next token to return.
640 *
641 * @return The next token index
642 */
643 @Override
644 public int nextIndex() {
645 return tokenPos;
646 }
647
648 /**
649 * Gets the next token from the String.
650 * Equivalent to {@link #next()} except it returns null rather than
651 * throwing {@link NoSuchElementException} when no tokens remain.
652 *
653 * @return The next sequential token, or null when no more tokens are found
654 */
655 public String nextToken() {
656 if (hasNext()) {
657 return tokens[tokenPos++];
658 }
659 return null;
660 }
661
662 /**
663 * Gets the token previous to the last returned token.
664 *
665 * @return The previous token
666 */
667 @Override
668 public String previous() {
669 if (hasPrevious()) {
670 return tokens[--tokenPos];
671 }
672 throw new NoSuchElementException();
673 }
674
675 /**
676 * Gets the index of the previous token.
677 *
678 * @return The previous token index
679 */
680 @Override
681 public int previousIndex() {
682 return tokenPos - 1;
683 }
684
685 /**
686 * Gets the previous token from the String.
687 *
688 * @return The previous sequential token, or null when no more tokens are found
689 */
690 public String previousToken() {
691 if (hasPrevious()) {
692 return tokens[--tokenPos];
693 }
694 return null;
695 }
696
697 /**
698 * Reads character by character through the String to get the next token.
699 *
700 * @param srcChars the character array being tokenized
701 * @param start the first character of field
702 * @param len the length of the character array being tokenized
703 * @param workArea a temporary work area
704 * @param tokenList the list of parsed tokens
705 * @return The starting position of the next field (the character
706 * immediately after the delimiter), or -1 if end of string found
707 */
708 private int readNextToken(final char[] srcChars,
709 int start,
710 final int len,
711 final StrBuilder workArea,
712 final List<String> tokenList) {
713 // skip all leading whitespace, unless it is the
714 // field delimiter or the quote character
715 while (start < len) {
716 final int removeLen = Math.max(
717 getIgnoredMatcher().isMatch(srcChars, start, start, len),
718 getTrimmerMatcher().isMatch(srcChars, start, start, len));
719 if (removeLen == 0
720 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
721 || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
722 break;
723 }
724 start += removeLen;
725 }
726
727 // handle reaching end
728 if (start >= len) {
729 addToken(tokenList, StringUtils.EMPTY);
730 return -1;
731 }
732
733 // handle empty token
734 final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
735 if (delimLen > 0) {
736 addToken(tokenList, StringUtils.EMPTY);
737 return start + delimLen;
738 }
739
740 // handle found token
741 final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
742 if (quoteLen > 0) {
743 return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
744 }
745 return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
746 }
747
748 /**
749 * Reads a possibly quoted string token.
750 *
751 * @param srcChars the character array being tokenized
752 * @param start the first character of field
753 * @param len the length of the character array being tokenized
754 * @param workArea a temporary work area
755 * @param tokenList the list of parsed tokens
756 * @param quoteStart the start position of the matched quote, 0 if no quoting
757 * @param quoteLen the length of the matched quote, 0 if no quoting
758 * @return The starting position of the next field (the character
759 * immediately after the delimiter, or if end of string found,
760 * then the length of string
761 */
762 private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
763 final List<String> tokenList, final int quoteStart, final int quoteLen) {
764 // Loop until we've found the end of the quoted
765 // string or the end of the input
766 workArea.clear();
767 int pos = start;
768 boolean quoting = quoteLen > 0;
769 int trimStart = 0;
770
771 while (pos < len) {
772 // quoting mode can occur several times throughout a string
773 // we must switch between quoting and non-quoting until we
774 // encounter a non-quoted delimiter, or end of string
775 if (quoting) {
776 // In quoting mode
777
778 // If we've found a quote character, see if it's
779 // followed by a second quote. If so, then we need
780 // to actually put the quote character into the token
781 // rather than end the token.
782 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
783 if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
784 // matched pair of quotes, thus an escaped quote
785 workArea.append(srcChars, pos, quoteLen);
786 pos += quoteLen * 2;
787 trimStart = workArea.size();
788 continue;
789 }
790
791 // end of quoting
792 quoting = false;
793 pos += quoteLen;
794 continue;
795 }
796
797 } else {
798 // Not in quoting mode
799
800 // check for delimiter, and thus end of token
801 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
802 if (delimLen > 0) {
803 // return condition when end of token found
804 addToken(tokenList, workArea.substring(0, trimStart));
805 return pos + delimLen;
806 }
807
808 // check for quote, and thus back into quoting mode
809 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
810 quoting = true;
811 pos += quoteLen;
812 continue;
813 }
814
815 // check for ignored (outside quotes), and ignore
816 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
817 if (ignoredLen > 0) {
818 pos += ignoredLen;
819 continue;
820 }
821
822 // check for trimmed character
823 // don't yet know if its at the end, so copy to workArea
824 // use trimStart to keep track of trim at the end
825 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
826 if (trimmedLen > 0) {
827 workArea.append(srcChars, pos, trimmedLen);
828 pos += trimmedLen;
829 continue;
830 }
831
832 }
833 // copy regular character from inside quotes
834 workArea.append(srcChars[pos++]);
835 trimStart = workArea.size();
836 }
837
838 // return condition when end of string found
839 addToken(tokenList, workArea.substring(0, trimStart));
840 return -1;
841 }
842
843 /**
844 * Unsupported ListIterator operation.
845 *
846 * @throws UnsupportedOperationException always
847 */
848 @Override
849 public void remove() {
850 throw new UnsupportedOperationException("remove() is unsupported");
851 }
852
853 /**
854 * Resets this tokenizer, forgetting all parsing and iteration already completed.
855 * <p>
856 * This method allows the same tokenizer to be reused for the same String.
857 *
858 * @return this, to enable chaining
859 */
860 public StrTokenizer reset() {
861 tokenPos = 0;
862 tokens = null;
863 return this;
864 }
865
866 /**
867 * Reset this tokenizer, giving it a new input string to parse.
868 * In this manner you can re-use a tokenizer with the same settings
869 * on multiple input lines.
870 *
871 * @param input the new character array to tokenize, not cloned, null sets no text to parse
872 * @return this, to enable chaining
873 */
874 public StrTokenizer reset(final char[] input) {
875 reset();
876 if (input != null) {
877 this.chars = input.clone();
878 } else {
879 this.chars = null;
880 }
881 return this;
882 }
883
884 /**
885 * Reset this tokenizer, giving it a new input string to parse.
886 * In this manner you can re-use a tokenizer with the same settings
887 * on multiple input lines.
888 *
889 * @param input the new string to tokenize, null sets no text to parse
890 * @return this, to enable chaining
891 */
892 public StrTokenizer reset(final String input) {
893 reset();
894 if (input != null) {
895 this.chars = input.toCharArray();
896 } else {
897 this.chars = null;
898 }
899 return this;
900 }
901
902 /**
903 * Unsupported ListIterator operation.
904 * @param obj this parameter ignored.
905 * @throws UnsupportedOperationException always
906 */
907 @Override
908 public void set(final String obj) {
909 throw new UnsupportedOperationException("set() is unsupported");
910 }
911
912 /**
913 * Sets the field delimiter character.
914 *
915 * @param delim the delimiter character to use
916 * @return this, to enable chaining
917 */
918 public StrTokenizer setDelimiterChar(final char delim) {
919 return setDelimiterMatcher(StrMatcher.charMatcher(delim));
920 }
921
922 /**
923 * Sets the field delimiter matcher.
924 * <p>
925 * The delimiter is used to separate one token from another.
926 * </p>
927 *
928 * @param delim the delimiter matcher to use
929 * @return this, to enable chaining
930 */
931 public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
932 if (delim == null) {
933 this.delimMatcher = StrMatcher.noneMatcher();
934 } else {
935 this.delimMatcher = delim;
936 }
937 return this;
938 }
939
940 /**
941 * Sets the field delimiter string.
942 *
943 * @param delim the delimiter string to use
944 * @return this, to enable chaining
945 */
946 public StrTokenizer setDelimiterString(final String delim) {
947 return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
948 }
949
950 /**
951 * Sets whether the tokenizer should return empty tokens as null.
952 * The default for this property is false.
953 *
954 * @param emptyAsNull whether empty tokens are returned as null
955 * @return this, to enable chaining
956 */
957 public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
958 this.emptyAsNull = emptyAsNull;
959 return this;
960 }
961
962 /**
963 * Sets the character to ignore.
964 * <p>
965 * This character is ignored when parsing the String, unless it is
966 * within a quoted region.
967 * </p>
968 *
969 * @param ignored the ignored character to use
970 * @return this, to enable chaining
971 */
972 public StrTokenizer setIgnoredChar(final char ignored) {
973 return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
974 }
975
976 /**
977 * Sets the matcher for characters to ignore.
978 * <p>
979 * These characters are ignored when parsing the String, unless they are
980 * within a quoted region.
981 * </p>
982 *
983 * @param ignored the ignored matcher to use, null ignored
984 * @return this, to enable chaining
985 */
986 public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
987 if (ignored != null) {
988 this.ignoredMatcher = ignored;
989 }
990 return this;
991 }
992
993 /**
994 * Sets whether the tokenizer should ignore and not return empty tokens.
995 * The default for this property is true.
996 *
997 * @param ignoreEmptyTokens whether empty tokens are not returned
998 * @return this, to enable chaining
999 */
1000 public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
1001 this.ignoreEmptyTokens = ignoreEmptyTokens;
1002 return this;
1003 }
1004
1005 /**
1006 * Sets the quote character to use.
1007 * <p>
1008 * The quote character is used to wrap data between the tokens.
1009 * This enables delimiters to be entered as data.
1010 * </p>
1011 *
1012 * @param quote the quote character to use
1013 * @return this, to enable chaining
1014 */
1015 public StrTokenizer setQuoteChar(final char quote) {
1016 return setQuoteMatcher(StrMatcher.charMatcher(quote));
1017 }
1018
1019 /**
1020 * Sets the quote matcher to use.
1021 * <p>
1022 * The quote character is used to wrap data between the tokens.
1023 * This enables delimiters to be entered as data.
1024 * </p>
1025 *
1026 * @param quote the quote matcher to use, null ignored
1027 * @return this, to enable chaining
1028 */
1029 public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
1030 if (quote != null) {
1031 this.quoteMatcher = quote;
1032 }
1033 return this;
1034 }
1035
1036 /**
1037 * Sets the matcher for characters to trim.
1038 * <p>
1039 * These characters are trimmed off on each side of the delimiter
1040 * until the token or quote is found.
1041 * </p>
1042 *
1043 * @param trimmer the trimmer matcher to use, null ignored
1044 * @return this, to enable chaining
1045 */
1046 public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1047 if (trimmer != null) {
1048 this.trimmerMatcher = trimmer;
1049 }
1050 return this;
1051 }
1052
1053 /**
1054 * Gets the number of tokens found in the String.
1055 *
1056 * @return The number of matched tokens
1057 */
1058 public int size() {
1059 checkTokenized();
1060 return tokens.length;
1061 }
1062
1063 /**
1064 * Internal method to performs the tokenization.
1065 * <p>
1066 * Most users of this class do not need to call this method. This method
1067 * will be called automatically by other (public) methods when required.
1068 * </p>
1069 * <p>
1070 * This method exists to allow subclasses to add code before or after the
1071 * tokenization. For example, a subclass could alter the character array,
1072 * offset or count to be parsed, or call the tokenizer multiple times on
1073 * multiple strings. It is also be possible to filter the results.
1074 * </p>
1075 * <p>
1076 * {@code StrTokenizer} will always pass a zero offset and a count
1077 * equal to the length of the array to this method, however a subclass
1078 * may pass other values, or even an entirely different array.
1079 * </p>
1080 *
1081 * @param srcChars the character array being tokenized, may be null
1082 * @param offset the start position within the character array, must be valid
1083 * @param count the number of characters to tokenize, must be valid
1084 * @return The modifiable list of String tokens, unmodifiable if null array or zero count
1085 */
1086 protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1087 if (srcChars == null || count == 0) {
1088 return Collections.emptyList();
1089 }
1090 final StrBuilder buf = new StrBuilder();
1091 final List<String> tokenList = new ArrayList<>();
1092 int pos = offset;
1093
1094 // loop around the entire buffer
1095 while (pos >= 0 && pos < count) {
1096 // find next token
1097 pos = readNextToken(srcChars, pos, count, buf, tokenList);
1098
1099 // handle case where end of string is a delimiter
1100 if (pos >= count) {
1101 addToken(tokenList, StringUtils.EMPTY);
1102 }
1103 }
1104 return tokenList;
1105 }
1106
1107 /**
1108 * Gets the String content that the tokenizer is parsing.
1109 *
1110 * @return The string content being parsed
1111 */
1112 @Override
1113 public String toString() {
1114 if (tokens == null) {
1115 return "StrTokenizer[not tokenized yet]";
1116 }
1117 return "StrTokenizer" + getTokenList();
1118 }
1119
1120 }