1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.lang3.text;
18
19 import java.util.ArrayList;
20 import java.util.Arrays;
21 import java.util.Collections;
22 import java.util.List;
23 import java.util.ListIterator;
24 import java.util.NoSuchElementException;
25 import java.util.StringTokenizer;
26
27 import org.apache.commons.lang3.ArrayUtils;
28 import org.apache.commons.lang3.StringUtils;
29
30 /**
31 * Tokenizes a string based on delimiters (separators)
32 * and supporting quoting and ignored character concepts.
33 * <p>
34 * This class can split a String into many smaller strings. It aims
35 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
36 * however it offers much more control and flexibility including implementing
37 * the {@link ListIterator} interface. By default, it is set up
38 * like {@link StringTokenizer}.
39 * </p>
40 * <p>
41 * The input String is split into a number of <em>tokens</em>.
42 * Each token is separated from the next String by a <em>delimiter</em>.
43 * One or more delimiter characters must be specified.
44 * </p>
45 * <p>
46 * Each token may be surrounded by quotes.
47 * The <em>quote</em> matcher specifies the quote character(s).
48 * A quote may be escaped within a quoted section by duplicating itself.
49 * </p>
50 * <p>
51 * Between each token and the delimiter are potentially characters that need trimming.
52 * The <em>trimmer</em> matcher specifies these characters.
53 * One usage might be to trim whitespace characters.
54 * </p>
55 * <p>
56 * At any point outside the quotes there might potentially be invalid characters.
57 * The <em>ignored</em> matcher specifies these characters to be removed.
58 * One usage might be to remove new line characters.
59 * </p>
60 * <p>
61 * Empty tokens may be removed or returned as null.
62 * </p>
63 * <pre>
64 * "a,b,c" - Three tokens "a","b","c" (comma delimiter)
65 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace)
66 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
67 * </pre>
68 *
69 * <table>
70 * <caption>StrTokenizer properties and options</caption>
71 * <tr>
72 * <th>Property</th><th>Type</th><th>Default</th>
73 * </tr>
74 * <tr>
75 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
76 * </tr>
77 * <tr>
78 * <td>quote</td><td>NoneMatcher</td><td>{}</td>
79 * </tr>
80 * <tr>
81 * <td>ignore</td><td>NoneMatcher</td><td>{}</td>
82 * </tr>
83 * <tr>
84 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
85 * </tr>
86 * <tr>
87 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
88 * </tr>
89 * </table>
90 *
91 * @since 2.2
92 * @deprecated As of <a href="https://commons.apache.org/proper/commons-lang/changes-report.html#a3.6">3.6</a>, use Apache Commons Text
93 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html">
94 * StringTokenizer</a>.
95 */
96 @Deprecated
97 public class StrTokenizer implements ListIterator<String>, Cloneable {
98
99 // @formatter:off
100 private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
101 .setDelimiterMatcher(StrMatcher.commaMatcher())
102 .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
103 .setIgnoredMatcher(StrMatcher.noneMatcher())
104 .setTrimmerMatcher(StrMatcher.trimMatcher())
105 .setEmptyTokenAsNull(false)
106 .setIgnoreEmptyTokens(false);
107 // @formatter:on
108
109 // @formatter:off
110 private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
111 .setDelimiterMatcher(StrMatcher.tabMatcher())
112 .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
113 .setIgnoredMatcher(StrMatcher.noneMatcher())
114 .setTrimmerMatcher(StrMatcher.trimMatcher())
115 .setEmptyTokenAsNull(false)
116 .setIgnoreEmptyTokens(false);
117 // @formatter:on
118
119 /**
120 * Gets a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
121 *
122 * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
123 */
124 private static StrTokenizer getCSVClone() {
125 return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
126 }
127
128 /**
129 * Gets a new tokenizer instance which parses Comma Separated Value strings
130 * initializing it with the given input. The default for CSV processing
131 * will be trim whitespace from both ends (which can be overridden with
132 * the setTrimmer method).
133 * <p>
134 * You must call a "reset" method to set the string which you want to parse.
135 * </p>
136 *
137 * @return a new tokenizer instance which parses Comma Separated Value strings.
138 */
139 public static StrTokenizer getCSVInstance() {
140 return getCSVClone();
141 }
142
143 /**
144 * Gets a new tokenizer instance which parses Comma Separated Value strings
145 * initializing it with the given input. The default for CSV processing
146 * will be trim whitespace from both ends (which can be overridden with
147 * the setTrimmer method).
148 *
149 * @param input the text to parse.
150 * @return a new tokenizer instance which parses Comma Separated Value strings.
151 */
152 public static StrTokenizer getCSVInstance(final char[] input) {
153 final StrTokenizer tok = getCSVClone();
154 tok.reset(input);
155 return tok;
156 }
157
158 /**
159 * Gets a new tokenizer instance which parses Comma Separated Value strings
160 * initializing it with the given input. The default for CSV processing
161 * will be trim whitespace from both ends (which can be overridden with
162 * the setTrimmer method).
163 *
164 * @param input the text to parse.
165 * @return a new tokenizer instance which parses Comma Separated Value strings.
166 */
167 public static StrTokenizer getCSVInstance(final String input) {
168 final StrTokenizer tok = getCSVClone();
169 tok.reset(input);
170 return tok;
171 }
172
173 /**
174 * Gets a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
175 *
176 * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
177 */
178 private static StrTokenizer getTSVClone() {
179 return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
180 }
181
182 /**
183 * Gets a new tokenizer instance which parses Tab Separated Value strings.
184 * The default for CSV processing will be trim whitespace from both ends
185 * (which can be overridden with the setTrimmer method).
186 * <p>
187 * You must call a "reset" method to set the string which you want to parse.
188 * </p>
189 *
190 * @return a new tokenizer instance which parses Tab Separated Value strings.
191 */
192 public static StrTokenizer getTSVInstance() {
193 return getTSVClone();
194 }
195
196 /**
197 * Gets a new tokenizer instance which parses Tab Separated Value strings.
198 * The default for CSV processing will be trim whitespace from both ends
199 * (which can be overridden with the setTrimmer method).
200 *
201 * @param input the string to parse.
202 * @return a new tokenizer instance which parses Tab Separated Value strings.
203 */
204 public static StrTokenizer getTSVInstance(final char[] input) {
205 final StrTokenizer tok = getTSVClone();
206 tok.reset(input);
207 return tok;
208 }
209
210 /**
211 * Gets a new tokenizer instance which parses Tab Separated Value strings.
212 * The default for CSV processing will be trim whitespace from both ends
213 * (which can be overridden with the setTrimmer method).
214 *
215 * @param input the string to parse.
216 * @return a new tokenizer instance which parses Tab Separated Value strings.
217 */
218 public static StrTokenizer getTSVInstance(final String input) {
219 final StrTokenizer tok = getTSVClone();
220 tok.reset(input);
221 return tok;
222 }
223
224 /** The text to work on. */
225 private char[] chars;
226
227 /** The parsed tokens */
228 private String[] tokens;
229
230 /** The current iteration position */
231 private int tokenPos;
232
233 /** The delimiter matcher */
234 private StrMatcher delimMatcher = StrMatcher.splitMatcher();
235
236 /** The quote matcher */
237 private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
238
239 /** The ignored matcher */
240 private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
241
242 /** The trimmer matcher */
243 private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
244
245 /** Whether to return empty tokens as null */
246 private boolean emptyAsNull;
247
248 /** Whether to ignore empty tokens */
249 private boolean ignoreEmptyTokens = true;
250
251 /**
252 * Constructs a tokenizer splitting on space, tab, newline and formfeed
253 * as per StringTokenizer, but with no text to tokenize.
254 * <p>
255 * This constructor is normally used with {@link #reset(String)}.
256 * </p>
257 */
258 public StrTokenizer() {
259 this.chars = null;
260 }
261
262 /**
263 * Constructs a tokenizer splitting on space, tab, newline and formfeed
264 * as per StringTokenizer.
265 *
266 * @param input the string which is to be parsed, not cloned.
267 */
268 public StrTokenizer(final char[] input) {
269 this.chars = ArrayUtils.clone(input);
270 }
271
272 /**
273 * Constructs a tokenizer splitting on the specified character.
274 *
275 * @param input the string which is to be parsed, not cloned.
276 * @param delim the field delimiter character.
277 */
278 public StrTokenizer(final char[] input, final char delim) {
279 this(input);
280 setDelimiterChar(delim);
281 }
282
283 /**
284 * Constructs a tokenizer splitting on the specified delimiter character
285 * and handling quotes using the specified quote character.
286 *
287 * @param input the string which is to be parsed, not cloned.
288 * @param delim the field delimiter character.
289 * @param quote the field quoted string character.
290 */
291 public StrTokenizer(final char[] input, final char delim, final char quote) {
292 this(input, delim);
293 setQuoteChar(quote);
294 }
295
296 /**
297 * Constructs a tokenizer splitting on the specified string.
298 *
299 * @param input the string which is to be parsed, not cloned.
300 * @param delim the field delimiter string.
301 */
302 public StrTokenizer(final char[] input, final String delim) {
303 this(input);
304 setDelimiterString(delim);
305 }
306
307 /**
308 * Constructs a tokenizer splitting using the specified delimiter matcher.
309 *
310 * @param input the string which is to be parsed, not cloned.
311 * @param delim the field delimiter matcher.
312 */
313 public StrTokenizer(final char[] input, final StrMatcher delim) {
314 this(input);
315 setDelimiterMatcher(delim);
316 }
317
318 /**
319 * Constructs a tokenizer splitting using the specified delimiter matcher
320 * and handling quotes using the specified quote matcher.
321 *
322 * @param input the string which is to be parsed, not cloned.
323 * @param delim the field delimiter character.
324 * @param quote the field quoted string character.
325 */
326 public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
327 this(input, delim);
328 setQuoteMatcher(quote);
329 }
330
331 /**
332 * Constructs a tokenizer splitting on space, tab, newline and formfeed
333 * as per StringTokenizer.
334 *
335 * @param input the string which is to be parsed.
336 */
337 public StrTokenizer(final String input) {
338 if (input != null) {
339 chars = input.toCharArray();
340 } else {
341 chars = null;
342 }
343 }
344
345 /**
346 * Constructs a tokenizer splitting on the specified delimiter character.
347 *
348 * @param input the string which is to be parsed.
349 * @param delim the field delimiter character.
350 */
351 public StrTokenizer(final String input, final char delim) {
352 this(input);
353 setDelimiterChar(delim);
354 }
355
356 /**
357 * Constructs a tokenizer splitting on the specified delimiter character
358 * and handling quotes using the specified quote character.
359 *
360 * @param input the string which is to be parsed.
361 * @param delim the field delimiter character.
362 * @param quote the field quoted string character.
363 */
364 public StrTokenizer(final String input, final char delim, final char quote) {
365 this(input, delim);
366 setQuoteChar(quote);
367 }
368
369 /**
370 * Constructs a tokenizer splitting on the specified delimiter string.
371 *
372 * @param input the string which is to be parsed.
373 * @param delim the field delimiter string.
374 */
375 public StrTokenizer(final String input, final String delim) {
376 this(input);
377 setDelimiterString(delim);
378 }
379
380 /**
381 * Constructs a tokenizer splitting using the specified delimiter matcher.
382 *
383 * @param input the string which is to be parsed.
384 * @param delim the field delimiter matcher.
385 */
386 public StrTokenizer(final String input, final StrMatcher delim) {
387 this(input);
388 setDelimiterMatcher(delim);
389 }
390
391 /**
392 * Constructs a tokenizer splitting using the specified delimiter matcher
393 * and handling quotes using the specified quote matcher.
394 *
395 * @param input the string which is to be parsed.
396 * @param delim the field delimiter matcher.
397 * @param quote the field quoted string matcher.
398 */
399 public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
400 this(input, delim);
401 setQuoteMatcher(quote);
402 }
403
404 /**
405 * Unsupported ListIterator operation.
406 *
407 * @param obj this parameter ignored.
408 * @throws UnsupportedOperationException always.
409 */
410 @Override
411 public void add(final String obj) {
412 throw new UnsupportedOperationException("add() is unsupported");
413 }
414
415 /**
416 * Adds a token to a list, paying attention to the parameters we've set.
417 *
418 * @param list the list to add to.
419 * @param tok the token to add.
420 */
421 private void addToken(final List<String> list, String tok) {
422 if (StringUtils.isEmpty(tok)) {
423 if (isIgnoreEmptyTokens()) {
424 return;
425 }
426 if (isEmptyTokenAsNull()) {
427 tok = null;
428 }
429 }
430 list.add(tok);
431 }
432
433 /**
434 * Checks if tokenization has been done, and if not then do it.
435 */
436 private void checkTokenized() {
437 if (tokens == null) {
438 if (chars == null) {
439 // still call tokenize as subclass may do some work
440 final List<String> split = tokenize(null, 0, 0);
441 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
442 } else {
443 final List<String> split = tokenize(chars, 0, chars.length);
444 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
445 }
446 }
447 }
448
449 /**
450 * Creates a new instance of this Tokenizer. The new instance is reset so
451 * that it will be at the start of the token list.
452 * If a {@link CloneNotSupportedException} is caught, return {@code null}.
453 *
454 * @return a new instance of this Tokenizer which has been reset.
455 */
456 @Override
457 public Object clone() {
458 try {
459 return cloneReset();
460 } catch (final CloneNotSupportedException ex) {
461 return null;
462 }
463 }
464
465 /**
466 * Creates a new instance of this Tokenizer. The new instance is reset so that
467 * it will be at the start of the token list.
468 *
469 * @return a new instance of this Tokenizer which has been reset.
470 * @throws CloneNotSupportedException if there is a problem cloning.
471 */
472 Object cloneReset() throws CloneNotSupportedException {
473 // this method exists to enable 100% test coverage
474 final StrTokenizer cloned = (StrTokenizer) super.clone();
475 if (cloned.chars != null) {
476 cloned.chars = cloned.chars.clone();
477 }
478 cloned.reset();
479 return cloned;
480 }
481
482 /**
483 * Gets the String content that the tokenizer is parsing.
484 *
485 * @return the string content being parsed.
486 */
487 public String getContent() {
488 if (chars == null) {
489 return null;
490 }
491 return new String(chars);
492 }
493
494 /**
495 * Gets the field delimiter matcher.
496 *
497 * @return the delimiter matcher in use.
498 */
499 public StrMatcher getDelimiterMatcher() {
500 return this.delimMatcher;
501 }
502
503 /**
504 * Gets the ignored character matcher.
505 * <p>
506 * These characters are ignored when parsing the String, unless they are
507 * within a quoted region.
508 * The default value is not to ignore anything.
509 * </p>
510 *
511 * @return the ignored matcher in use.
512 */
513 public StrMatcher getIgnoredMatcher() {
514 return ignoredMatcher;
515 }
516
517 /**
518 * Gets the quote matcher currently in use.
519 * <p>
520 * The quote character is used to wrap data between the tokens.
521 * This enables delimiters to be entered as data.
522 * The default value is '"' (double quote).
523 * </p>
524 *
525 * @return the quote matcher in use.
526 */
527 public StrMatcher getQuoteMatcher() {
528 return quoteMatcher;
529 }
530
531 /**
532 * Gets a copy of the full token list as an independent modifiable array.
533 *
534 * @return the tokens as a String array.
535 */
536 public String[] getTokenArray() {
537 checkTokenized();
538 return tokens.clone();
539 }
540
541 /**
542 * Gets a copy of the full token list as an independent modifiable list.
543 *
544 * @return the tokens as a String array.
545 */
546 public List<String> getTokenList() {
547 checkTokenized();
548 final List<String> list = new ArrayList<>(tokens.length);
549 list.addAll(Arrays.asList(tokens));
550 return list;
551 }
552
553 /**
554 * Gets the trimmer character matcher.
555 * <p>
556 * These characters are trimmed off on each side of the delimiter
557 * until the token or quote is found.
558 * The default value is not to trim anything.
559 * </p>
560 *
561 * @return the trimmer matcher in use.
562 */
563 public StrMatcher getTrimmerMatcher() {
564 return trimmerMatcher;
565 }
566
567 /**
568 * Checks whether there are any more tokens.
569 *
570 * @return true if there are more tokens.
571 */
572 @Override
573 public boolean hasNext() {
574 checkTokenized();
575 return tokenPos < tokens.length;
576 }
577
578 /**
579 * Checks whether there are any previous tokens that can be iterated to.
580 *
581 * @return true if there are previous tokens.
582 */
583 @Override
584 public boolean hasPrevious() {
585 checkTokenized();
586 return tokenPos > 0;
587 }
588
589 /**
590 * Gets whether the tokenizer currently returns empty tokens as null.
591 * The default for this property is false.
592 *
593 * @return true if empty tokens are returned as null.
594 */
595 public boolean isEmptyTokenAsNull() {
596 return this.emptyAsNull;
597 }
598
599 /**
600 * Gets whether the tokenizer currently ignores empty tokens.
601 * The default for this property is true.
602 *
603 * @return true if empty tokens are not returned.
604 */
605 public boolean isIgnoreEmptyTokens() {
606 return ignoreEmptyTokens;
607 }
608
609 /**
610 * Checks if the characters at the index specified match the quote
611 * already matched in readNextToken().
612 *
613 * @param srcChars the character array being tokenized.
614 * @param pos the position to check for a quote.
615 * @param len the length of the character array being tokenized.
616 * @param quoteStart the start position of the matched quote, 0 if no quoting.
617 * @param quoteLen the length of the matched quote, 0 if no quoting.
618 * @return true if a quote is matched.
619 */
620 private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
621 for (int i = 0; i < quoteLen; i++) {
622 if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
623 return false;
624 }
625 }
626 return true;
627 }
628
629 /**
630 * Gets the next token.
631 *
632 * @return the next String token.
633 * @throws NoSuchElementException if there are no more elements.
634 */
635 @Override
636 public String next() {
637 if (hasNext()) {
638 return tokens[tokenPos++];
639 }
640 throw new NoSuchElementException();
641 }
642
643 /**
644 * Gets the index of the next token to return.
645 *
646 * @return the next token index.
647 */
648 @Override
649 public int nextIndex() {
650 return tokenPos;
651 }
652
653 /**
654 * Gets the next token from the String.
655 * Equivalent to {@link #next()} except it returns null rather than
656 * throwing {@link NoSuchElementException} when no tokens remain.
657 *
658 * @return the next sequential token, or null when no more tokens are found.
659 */
660 public String nextToken() {
661 if (hasNext()) {
662 return tokens[tokenPos++];
663 }
664 return null;
665 }
666
667 /**
668 * Gets the token previous to the last returned token.
669 *
670 * @return the previous token.
671 */
672 @Override
673 public String previous() {
674 if (hasPrevious()) {
675 return tokens[--tokenPos];
676 }
677 throw new NoSuchElementException();
678 }
679
680 /**
681 * Gets the index of the previous token.
682 *
683 * @return the previous token index.
684 */
685 @Override
686 public int previousIndex() {
687 return tokenPos - 1;
688 }
689
690 /**
691 * Gets the previous token from the String.
692 *
693 * @return the previous sequential token, or null when no more tokens are found.
694 */
695 public String previousToken() {
696 if (hasPrevious()) {
697 return tokens[--tokenPos];
698 }
699 return null;
700 }
701
702 /**
703 * Reads character by character through the String to get the next token.
704 *
705 * @param srcChars the character array being tokenized.
706 * @param start the first character of field.
707 * @param len the length of the character array being tokenized.
708 * @param workArea a temporary work area.
709 * @param tokenList the list of parsed tokens.
710 * @return the starting position of the next field (the character
711 * immediately after the delimiter), or -1 if end of string found.
712 */
713 private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
714 // skip all leading whitespace, unless it is the
715 // field delimiter or the quote character
716 while (start < len) {
717 final int removeLen = Math.max(
718 getIgnoredMatcher().isMatch(srcChars, start, start, len),
719 getTrimmerMatcher().isMatch(srcChars, start, start, len));
720 if (removeLen == 0 ||
721 getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
722 getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
723 break;
724 }
725 start += removeLen;
726 }
727
728 // handle reaching end
729 if (start >= len) {
730 addToken(tokenList, StringUtils.EMPTY);
731 return -1;
732 }
733
734 // handle empty token
735 final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
736 if (delimLen > 0) {
737 addToken(tokenList, StringUtils.EMPTY);
738 return start + delimLen;
739 }
740
741 // handle found token
742 final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
743 if (quoteLen > 0) {
744 return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
745 }
746 return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
747 }
748
749 /**
750 * Reads a possibly quoted string token.
751 *
752 * @param srcChars the character array being tokenized.
753 * @param start the first character of field.
754 * @param len the length of the character array being tokenized.
755 * @param workArea a temporary work area.
756 * @param tokenList the list of parsed tokens.
757 * @param quoteStart the start position of the matched quote, 0 if no quoting.
758 * @param quoteLen the length of the matched quote, 0 if no quoting.
759 * @return the starting position of the next field (the character
760 * immediately after the delimiter, or if end of string found,
761 * then the length of string.
762 */
763 private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
764 final List<String> tokenList, final int quoteStart, final int quoteLen) {
765 // Loop until we've found the end of the quoted
766 // string or the end of the input
767 workArea.clear();
768 int pos = start;
769 boolean quoting = quoteLen > 0;
770 int trimStart = 0;
771
772 while (pos < len) {
773 // quoting mode can occur several times throughout a string
774 // we must switch between quoting and non-quoting until we
775 // encounter a non-quoted delimiter, or end of string
776 if (quoting) {
777 // In quoting mode
778
779 // If we've found a quote character, see if it's
780 // followed by a second quote. If so, then we need
781 // to actually put the quote character into the token
782 // rather than end the token.
783 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
784 if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
785 // matched pair of quotes, thus an escaped quote
786 workArea.append(srcChars, pos, quoteLen);
787 pos += quoteLen * 2;
788 trimStart = workArea.size();
789 continue;
790 }
791
792 // end of quoting
793 quoting = false;
794 pos += quoteLen;
795 continue;
796 }
797
798 } else {
799 // Not in quoting mode
800
801 // check for delimiter, and thus end of token
802 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
803 if (delimLen > 0) {
804 // return condition when end of token found
805 addToken(tokenList, workArea.substring(0, trimStart));
806 return pos + delimLen;
807 }
808
809 // check for quote, and thus back into quoting mode
810 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
811 quoting = true;
812 pos += quoteLen;
813 continue;
814 }
815
816 // check for ignored (outside quotes), and ignore
817 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
818 if (ignoredLen > 0) {
819 pos += ignoredLen;
820 continue;
821 }
822
823 // check for trimmed character
824 // don't yet know if it's at the end, so copy to workArea
825 // use trimStart to keep track of trim at the end
826 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
827 if (trimmedLen > 0) {
828 workArea.append(srcChars, pos, trimmedLen);
829 pos += trimmedLen;
830 continue;
831 }
832 }
833 // copy regular character from inside quotes
834 workArea.append(srcChars[pos++]);
835 trimStart = workArea.size();
836 }
837
838 // return condition when end of string found
839 addToken(tokenList, workArea.substring(0, trimStart));
840 return -1;
841 }
842
843 /**
844 * Unsupported ListIterator operation.
845 *
846 * @throws UnsupportedOperationException always.
847 */
848 @Override
849 public void remove() {
850 throw new UnsupportedOperationException("remove() is unsupported");
851 }
852
853 /**
854 * Resets this tokenizer, forgetting all parsing and iteration already completed.
855 * <p>
856 * This method allows the same tokenizer to be reused for the same String.
857 * </p>
858 *
859 * @return {@code this} instance.
860 */
861 public StrTokenizer reset() {
862 tokenPos = 0;
863 tokens = null;
864 return this;
865 }
866
867 /**
868 * Reset this tokenizer, giving it a new input string to parse.
869 * In this manner you can re-use a tokenizer with the same settings
870 * on multiple input lines.
871 *
872 * @param input the new character array to tokenize, not cloned, null sets no text to parse.
873 * @return {@code this} instance.
874 */
875 public StrTokenizer reset(final char[] input) {
876 reset();
877 this.chars = ArrayUtils.clone(input);
878 return this;
879 }
880
881 /**
882 * Reset this tokenizer, giving it a new input string to parse.
883 * In this manner you can re-use a tokenizer with the same settings
884 * on multiple input lines.
885 *
886 * @param input the new string to tokenize, null sets no text to parse.
887 * @return {@code this} instance.
888 */
889 public StrTokenizer reset(final String input) {
890 reset();
891 if (input != null) {
892 this.chars = input.toCharArray();
893 } else {
894 this.chars = null;
895 }
896 return this;
897 }
898
899 /**
900 * Unsupported ListIterator operation.
901 *
902 * @param obj this parameter ignored.
903 * @throws UnsupportedOperationException always.
904 */
905 @Override
906 public void set(final String obj) {
907 throw new UnsupportedOperationException("set() is unsupported");
908 }
909
910 /**
911 * Sets the field delimiter character.
912 *
913 * @param delim the delimiter character to use.
914 * @return {@code this} instance.
915 */
916 public StrTokenizer setDelimiterChar(final char delim) {
917 return setDelimiterMatcher(StrMatcher.charMatcher(delim));
918 }
919
920 /**
921 * Sets the field delimiter matcher.
922 * <p>
923 * The delimiter is used to separate one token from another.
924 * </p>
925 *
926 * @param delim the delimiter matcher to use.
927 * @return {@code this} instance.
928 */
929 public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
930 if (delim == null) {
931 this.delimMatcher = StrMatcher.noneMatcher();
932 } else {
933 this.delimMatcher = delim;
934 }
935 return this;
936 }
937
938 /**
939 * Sets the field delimiter string.
940 *
941 * @param delim the delimiter string to use.
942 * @return {@code this} instance.
943 */
944 public StrTokenizer setDelimiterString(final String delim) {
945 return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
946 }
947
948 /**
949 * Sets whether the tokenizer should return empty tokens as null.
950 * The default for this property is false.
951 *
952 * @param emptyAsNull whether empty tokens are returned as null.
953 * @return {@code this} instance.
954 */
955 public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
956 this.emptyAsNull = emptyAsNull;
957 return this;
958 }
959
960 /**
961 * Sets the character to ignore.
962 * <p>
963 * This character is ignored when parsing the String, unless it is
964 * within a quoted region.
965 *
966 * @param ignored the ignored character to use.
967 * @return {@code this} instance.
968 */
969 public StrTokenizer setIgnoredChar(final char ignored) {
970 return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
971 }
972
973 /**
974 * Sets the matcher for characters to ignore.
975 * <p>
976 * These characters are ignored when parsing the String, unless they are
977 * within a quoted region.
978 * </p>
979 *
980 * @param ignored the ignored matcher to use, null ignored.
981 * @return {@code this} instance.
982 */
983 public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
984 if (ignored != null) {
985 this.ignoredMatcher = ignored;
986 }
987 return this;
988 }
989
990 /**
991 * Sets whether the tokenizer should ignore and not return empty tokens.
992 * The default for this property is true.
993 *
994 * @param ignoreEmptyTokens whether empty tokens are not returned.
995 * @return {@code this} instance.
996 */
997 public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
998 this.ignoreEmptyTokens = ignoreEmptyTokens;
999 return this;
1000 }
1001
1002 /**
1003 * Sets the quote character to use.
1004 * <p>
1005 * The quote character is used to wrap data between the tokens.
1006 * This enables delimiters to be entered as data.
1007 * </p>
1008 *
1009 * @param quote the quote character to use.
1010 * @return {@code this} instance.
1011 */
1012 public StrTokenizer setQuoteChar(final char quote) {
1013 return setQuoteMatcher(StrMatcher.charMatcher(quote));
1014 }
1015
1016 /**
1017 * Sets the quote matcher to use.
1018 * <p>
1019 * The quote character is used to wrap data between the tokens.
1020 * This enables delimiters to be entered as data.
1021 * </p>
1022 *
1023 * @param quote the quote matcher to use, null ignored.
1024 * @return {@code this} instance.
1025 */
1026 public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
1027 if (quote != null) {
1028 this.quoteMatcher = quote;
1029 }
1030 return this;
1031 }
1032
1033 /**
1034 * Sets the matcher for characters to trim.
1035 * <p>
1036 * These characters are trimmed off on each side of the delimiter
1037 * until the token or quote is found.
1038 * </p>
1039 *
1040 * @param trimmer the trimmer matcher to use, null ignored.
1041 * @return {@code this} instance.
1042 */
1043 public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1044 if (trimmer != null) {
1045 this.trimmerMatcher = trimmer;
1046 }
1047 return this;
1048 }
1049
1050 /**
1051 * Gets the number of tokens found in the String.
1052 *
1053 * @return the number of matched tokens.
1054 */
1055 public int size() {
1056 checkTokenized();
1057 return tokens.length;
1058 }
1059
1060 /**
1061 * Internal method to performs the tokenization.
1062 * <p>
1063 * Most users of this class do not need to call this method. This method
1064 * will be called automatically by other (public) methods when required.
1065 * </p>
1066 * <p>
1067 * This method exists to allow subclasses to add code before or after the
1068 * tokenization. For example, a subclass could alter the character array,
1069 * offset or count to be parsed, or call the tokenizer multiple times on
1070 * multiple strings. It is also be possible to filter the results.
1071 * </p>
1072 * <p>
1073 * {@link StrTokenizer} will always pass a zero offset and a count
1074 * equal to the length of the array to this method, however a subclass
1075 * may pass other values, or even an entirely different array.
1076 * </p>
1077 *
1078 * @param srcChars the character array being tokenized, may be null.
1079 * @param offset the start position within the character array, must be valid.
1080 * @param count the number of characters to tokenize, must be valid.
1081 * @return the modifiable list of String tokens, unmodifiable if null array or zero count.
1082 */
1083 protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1084 if (ArrayUtils.isEmpty(srcChars)) {
1085 return Collections.emptyList();
1086 }
1087 final StrBuilder buf = new StrBuilder();
1088 final List<String> tokenList = new ArrayList<>();
1089 int pos = offset;
1090
1091 // loop around the entire buffer
1092 while (pos >= 0 && pos < count) {
1093 // find next token
1094 pos = readNextToken(srcChars, pos, count, buf, tokenList);
1095
1096 // handle case where end of string is a delimiter
1097 if (pos >= count) {
1098 addToken(tokenList, StringUtils.EMPTY);
1099 }
1100 }
1101 return tokenList;
1102 }
1103
1104 /**
1105 * Gets the String content that the tokenizer is parsing.
1106 *
1107 * @return the string content being parsed.
1108 */
1109 @Override
1110 public String toString() {
1111 if (tokens == null) {
1112 return "StrTokenizer[not tokenized yet]";
1113 }
1114 return "StrTokenizer" + getTokenList();
1115 }
1116
1117 }