1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.lang3.text;
18
19 import java.util.ArrayList;
20 import java.util.Arrays;
21 import java.util.Collections;
22 import java.util.List;
23 import java.util.ListIterator;
24 import java.util.NoSuchElementException;
25 import java.util.StringTokenizer;
26
27 import org.apache.commons.lang3.ArrayUtils;
28 import org.apache.commons.lang3.StringUtils;
29
30 /**
31 * Tokenizes a string based on delimiters (separators)
32 * and supporting quoting and ignored character concepts.
33 * <p>
34 * This class can split a String into many smaller strings. It aims
35 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
36 * however it offers much more control and flexibility including implementing
37 * the {@link ListIterator} interface. By default, it is set up
38 * like {@link StringTokenizer}.
39 * </p>
40 * <p>
41 * The input String is split into a number of <em>tokens</em>.
42 * Each token is separated from the next String by a <em>delimiter</em>.
43 * One or more delimiter characters must be specified.
44 * </p>
45 * <p>
46 * Each token may be surrounded by quotes.
47 * The <em>quote</em> matcher specifies the quote character(s).
48 * A quote may be escaped within a quoted section by duplicating itself.
49 * </p>
50 * <p>
51 * Between each token and the delimiter are potentially characters that need trimming.
52 * The <em>trimmer</em> matcher specifies these characters.
53 * One usage might be to trim whitespace characters.
54 * </p>
55 * <p>
56 * At any point outside the quotes there might potentially be invalid characters.
57 * The <em>ignored</em> matcher specifies these characters to be removed.
58 * One usage might be to remove new line characters.
59 * </p>
60 * <p>
61 * Empty tokens may be removed or returned as null.
62 * </p>
63 * <pre>
64 * "a,b,c" - Three tokens "a","b","c" (comma delimiter)
65 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace)
66 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
67 * </pre>
68 *
69 * <table>
70 * <caption>StrTokenizer properties and options</caption>
71 * <tr>
72 * <th>Property</th><th>Type</th><th>Default</th>
73 * </tr>
74 * <tr>
75 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
76 * </tr>
77 * <tr>
78 * <td>quote</td><td>NoneMatcher</td><td>{}</td>
79 * </tr>
80 * <tr>
81 * <td>ignore</td><td>NoneMatcher</td><td>{}</td>
82 * </tr>
83 * <tr>
84 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
85 * </tr>
86 * <tr>
87 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
88 * </tr>
89 * </table>
90 *
91 * @since 2.2
92 * @deprecated As of <a href="https://commons.apache.org/proper/commons-lang/changes-report.html#a3.6">3.6</a>, use Apache Commons Text
93 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html">
94 * StringTokenizer</a>.
95 */
96 @Deprecated
97 public class StrTokenizer implements ListIterator<String>, Cloneable {
98
99 // @formatter:off
100 private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
101 .setDelimiterMatcher(StrMatcher.commaMatcher())
102 .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
103 .setIgnoredMatcher(StrMatcher.noneMatcher())
104 .setTrimmerMatcher(StrMatcher.trimMatcher())
105 .setEmptyTokenAsNull(false)
106 .setIgnoreEmptyTokens(false);
107 // @formatter:on
108
109 // @formatter:off
110 private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
111 .setDelimiterMatcher(StrMatcher.tabMatcher())
112 .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
113 .setIgnoredMatcher(StrMatcher.noneMatcher())
114 .setTrimmerMatcher(StrMatcher.trimMatcher())
115 .setEmptyTokenAsNull(false)
116 .setIgnoreEmptyTokens(false);
117 // @formatter:on
118
119 /**
120 * Gets a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
121 *
122 * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
123 */
124 private static StrTokenizer getCSVClone() {
125 return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
126 }
127 /**
128 * Gets a new tokenizer instance which parses Comma Separated Value strings
129 * initializing it with the given input. The default for CSV processing
130 * will be trim whitespace from both ends (which can be overridden with
131 * the setTrimmer method).
132 * <p>
133 * You must call a "reset" method to set the string which you want to parse.
134 * </p>
135 * @return a new tokenizer instance which parses Comma Separated Value strings.
136 */
137 public static StrTokenizer getCSVInstance() {
138 return getCSVClone();
139 }
140 /**
141 * Gets a new tokenizer instance which parses Comma Separated Value strings
142 * initializing it with the given input. The default for CSV processing
143 * will be trim whitespace from both ends (which can be overridden with
144 * the setTrimmer method).
145 *
146 * @param input the text to parse.
147 * @return a new tokenizer instance which parses Comma Separated Value strings.
148 */
149 public static StrTokenizer getCSVInstance(final char[] input) {
150 final StrTokenizer tok = getCSVClone();
151 tok.reset(input);
152 return tok;
153 }
154
155 /**
156 * Gets a new tokenizer instance which parses Comma Separated Value strings
157 * initializing it with the given input. The default for CSV processing
158 * will be trim whitespace from both ends (which can be overridden with
159 * the setTrimmer method).
160 *
161 * @param input the text to parse.
162 * @return a new tokenizer instance which parses Comma Separated Value strings.
163 */
164 public static StrTokenizer getCSVInstance(final String input) {
165 final StrTokenizer tok = getCSVClone();
166 tok.reset(input);
167 return tok;
168 }
169 /**
170 * Gets a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
171 *
172 * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
173 */
174 private static StrTokenizer getTSVClone() {
175 return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
176 }
177
178 /**
179 * Gets a new tokenizer instance which parses Tab Separated Value strings.
180 * The default for CSV processing will be trim whitespace from both ends
181 * (which can be overridden with the setTrimmer method).
182 * <p>
183 * You must call a "reset" method to set the string which you want to parse.
184 * </p>
185 * @return a new tokenizer instance which parses Tab Separated Value strings.
186 */
187 public static StrTokenizer getTSVInstance() {
188 return getTSVClone();
189 }
190
191 /**
192 * Gets a new tokenizer instance which parses Tab Separated Value strings.
193 * The default for CSV processing will be trim whitespace from both ends
194 * (which can be overridden with the setTrimmer method).
195 *
196 * @param input the string to parse.
197 * @return a new tokenizer instance which parses Tab Separated Value strings.
198 */
199 public static StrTokenizer getTSVInstance(final char[] input) {
200 final StrTokenizer tok = getTSVClone();
201 tok.reset(input);
202 return tok;
203 }
204
205 /**
206 * Gets a new tokenizer instance which parses Tab Separated Value strings.
207 * The default for CSV processing will be trim whitespace from both ends
208 * (which can be overridden with the setTrimmer method).
209 *
210 * @param input the string to parse.
211 * @return a new tokenizer instance which parses Tab Separated Value strings.
212 */
213 public static StrTokenizer getTSVInstance(final String input) {
214 final StrTokenizer tok = getTSVClone();
215 tok.reset(input);
216 return tok;
217 }
218 /** The text to work on. */
219 private char[] chars;
220
221 /** The parsed tokens */
222 private String[] tokens;
223
224 /** The current iteration position */
225 private int tokenPos;
226
227 /** The delimiter matcher */
228 private StrMatcher delimMatcher = StrMatcher.splitMatcher();
229
230 /** The quote matcher */
231 private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
232
233 /** The ignored matcher */
234 private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
235
236 /** The trimmer matcher */
237 private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
238
239 /** Whether to return empty tokens as null */
240 private boolean emptyAsNull;
241
242 /** Whether to ignore empty tokens */
243 private boolean ignoreEmptyTokens = true;
244
245 /**
246 * Constructs a tokenizer splitting on space, tab, newline and formfeed
247 * as per StringTokenizer, but with no text to tokenize.
248 * <p>
249 * This constructor is normally used with {@link #reset(String)}.
250 * </p>
251 */
252 public StrTokenizer() {
253 this.chars = null;
254 }
255
256 /**
257 * Constructs a tokenizer splitting on space, tab, newline and formfeed
258 * as per StringTokenizer.
259 *
260 * @param input the string which is to be parsed, not cloned.
261 */
262 public StrTokenizer(final char[] input) {
263 this.chars = ArrayUtils.clone(input);
264 }
265
266 /**
267 * Constructs a tokenizer splitting on the specified character.
268 *
269 * @param input the string which is to be parsed, not cloned.
270 * @param delim the field delimiter character.
271 */
272 public StrTokenizer(final char[] input, final char delim) {
273 this(input);
274 setDelimiterChar(delim);
275 }
276
277 /**
278 * Constructs a tokenizer splitting on the specified delimiter character
279 * and handling quotes using the specified quote character.
280 *
281 * @param input the string which is to be parsed, not cloned.
282 * @param delim the field delimiter character.
283 * @param quote the field quoted string character.
284 */
285 public StrTokenizer(final char[] input, final char delim, final char quote) {
286 this(input, delim);
287 setQuoteChar(quote);
288 }
289
290 /**
291 * Constructs a tokenizer splitting on the specified string.
292 *
293 * @param input the string which is to be parsed, not cloned.
294 * @param delim the field delimiter string.
295 */
296 public StrTokenizer(final char[] input, final String delim) {
297 this(input);
298 setDelimiterString(delim);
299 }
300
301 /**
302 * Constructs a tokenizer splitting using the specified delimiter matcher.
303 *
304 * @param input the string which is to be parsed, not cloned.
305 * @param delim the field delimiter matcher.
306 */
307 public StrTokenizer(final char[] input, final StrMatcher delim) {
308 this(input);
309 setDelimiterMatcher(delim);
310 }
311
312 /**
313 * Constructs a tokenizer splitting using the specified delimiter matcher
314 * and handling quotes using the specified quote matcher.
315 *
316 * @param input the string which is to be parsed, not cloned.
317 * @param delim the field delimiter character.
318 * @param quote the field quoted string character.
319 */
320 public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
321 this(input, delim);
322 setQuoteMatcher(quote);
323 }
324
325 /**
326 * Constructs a tokenizer splitting on space, tab, newline and formfeed
327 * as per StringTokenizer.
328 *
329 * @param input the string which is to be parsed.
330 */
331 public StrTokenizer(final String input) {
332 if (input != null) {
333 chars = input.toCharArray();
334 } else {
335 chars = null;
336 }
337 }
338
339 /**
340 * Constructs a tokenizer splitting on the specified delimiter character.
341 *
342 * @param input the string which is to be parsed.
343 * @param delim the field delimiter character.
344 */
345 public StrTokenizer(final String input, final char delim) {
346 this(input);
347 setDelimiterChar(delim);
348 }
349
350 /**
351 * Constructs a tokenizer splitting on the specified delimiter character
352 * and handling quotes using the specified quote character.
353 *
354 * @param input the string which is to be parsed.
355 * @param delim the field delimiter character.
356 * @param quote the field quoted string character.
357 */
358 public StrTokenizer(final String input, final char delim, final char quote) {
359 this(input, delim);
360 setQuoteChar(quote);
361 }
362
363 /**
364 * Constructs a tokenizer splitting on the specified delimiter string.
365 *
366 * @param input the string which is to be parsed.
367 * @param delim the field delimiter string.
368 */
369 public StrTokenizer(final String input, final String delim) {
370 this(input);
371 setDelimiterString(delim);
372 }
373
374 /**
375 * Constructs a tokenizer splitting using the specified delimiter matcher.
376 *
377 * @param input the string which is to be parsed.
378 * @param delim the field delimiter matcher.
379 */
380 public StrTokenizer(final String input, final StrMatcher delim) {
381 this(input);
382 setDelimiterMatcher(delim);
383 }
384
385 /**
386 * Constructs a tokenizer splitting using the specified delimiter matcher
387 * and handling quotes using the specified quote matcher.
388 *
389 * @param input the string which is to be parsed.
390 * @param delim the field delimiter matcher.
391 * @param quote the field quoted string matcher.
392 */
393 public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
394 this(input, delim);
395 setQuoteMatcher(quote);
396 }
397
398 /**
399 * Unsupported ListIterator operation.
400 *
401 * @param obj this parameter ignored.
402 * @throws UnsupportedOperationException always.
403 */
404 @Override
405 public void add(final String obj) {
406 throw new UnsupportedOperationException("add() is unsupported");
407 }
408
409 /**
410 * Adds a token to a list, paying attention to the parameters we've set.
411 *
412 * @param list the list to add to.
413 * @param tok the token to add.
414 */
415 private void addToken(final List<String> list, String tok) {
416 if (StringUtils.isEmpty(tok)) {
417 if (isIgnoreEmptyTokens()) {
418 return;
419 }
420 if (isEmptyTokenAsNull()) {
421 tok = null;
422 }
423 }
424 list.add(tok);
425 }
426
427 /**
428 * Checks if tokenization has been done, and if not then do it.
429 */
430 private void checkTokenized() {
431 if (tokens == null) {
432 if (chars == null) {
433 // still call tokenize as subclass may do some work
434 final List<String> split = tokenize(null, 0, 0);
435 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
436 } else {
437 final List<String> split = tokenize(chars, 0, chars.length);
438 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
439 }
440 }
441 }
442
443 /**
444 * Creates a new instance of this Tokenizer. The new instance is reset so
445 * that it will be at the start of the token list.
446 * If a {@link CloneNotSupportedException} is caught, return {@code null}.
447 *
448 * @return a new instance of this Tokenizer which has been reset.
449 */
450 @Override
451 public Object clone() {
452 try {
453 return cloneReset();
454 } catch (final CloneNotSupportedException ex) {
455 return null;
456 }
457 }
458
459 /**
460 * Creates a new instance of this Tokenizer. The new instance is reset so that
461 * it will be at the start of the token list.
462 *
463 * @return a new instance of this Tokenizer which has been reset.
464 * @throws CloneNotSupportedException if there is a problem cloning.
465 */
466 Object cloneReset() throws CloneNotSupportedException {
467 // this method exists to enable 100% test coverage
468 final StrTokenizer cloned = (StrTokenizer) super.clone();
469 if (cloned.chars != null) {
470 cloned.chars = cloned.chars.clone();
471 }
472 cloned.reset();
473 return cloned;
474 }
475
476 /**
477 * Gets the String content that the tokenizer is parsing.
478 *
479 * @return the string content being parsed.
480 */
481 public String getContent() {
482 if (chars == null) {
483 return null;
484 }
485 return new String(chars);
486 }
487
488 /**
489 * Gets the field delimiter matcher.
490 *
491 * @return the delimiter matcher in use.
492 */
493 public StrMatcher getDelimiterMatcher() {
494 return this.delimMatcher;
495 }
496
497 // Ignored
498 /**
499 * Gets the ignored character matcher.
500 * <p>
501 * These characters are ignored when parsing the String, unless they are
502 * within a quoted region.
503 * The default value is not to ignore anything.
504 * </p>
505 *
506 * @return the ignored matcher in use.
507 */
508 public StrMatcher getIgnoredMatcher() {
509 return ignoredMatcher;
510 }
511
512 /**
513 * Gets the quote matcher currently in use.
514 * <p>
515 * The quote character is used to wrap data between the tokens.
516 * This enables delimiters to be entered as data.
517 * The default value is '"' (double quote).
518 * </p>
519 *
520 * @return the quote matcher in use.
521 */
522 public StrMatcher getQuoteMatcher() {
523 return quoteMatcher;
524 }
525
526 /**
527 * Gets a copy of the full token list as an independent modifiable array.
528 *
529 * @return the tokens as a String array.
530 */
531 public String[] getTokenArray() {
532 checkTokenized();
533 return tokens.clone();
534 }
535
536 /**
537 * Gets a copy of the full token list as an independent modifiable list.
538 *
539 * @return the tokens as a String array.
540 */
541 public List<String> getTokenList() {
542 checkTokenized();
543 final List<String> list = new ArrayList<>(tokens.length);
544 list.addAll(Arrays.asList(tokens));
545 return list;
546 }
547
548 /**
549 * Gets the trimmer character matcher.
550 * <p>
551 * These characters are trimmed off on each side of the delimiter
552 * until the token or quote is found.
553 * The default value is not to trim anything.
554 * </p>
555 *
556 * @return the trimmer matcher in use.
557 */
558 public StrMatcher getTrimmerMatcher() {
559 return trimmerMatcher;
560 }
561
562 /**
563 * Checks whether there are any more tokens.
564 *
565 * @return true if there are more tokens.
566 */
567 @Override
568 public boolean hasNext() {
569 checkTokenized();
570 return tokenPos < tokens.length;
571 }
572
573 /**
574 * Checks whether there are any previous tokens that can be iterated to.
575 *
576 * @return true if there are previous tokens.
577 */
578 @Override
579 public boolean hasPrevious() {
580 checkTokenized();
581 return tokenPos > 0;
582 }
583
584 /**
585 * Gets whether the tokenizer currently returns empty tokens as null.
586 * The default for this property is false.
587 *
588 * @return true if empty tokens are returned as null.
589 */
590 public boolean isEmptyTokenAsNull() {
591 return this.emptyAsNull;
592 }
593
594 /**
595 * Gets whether the tokenizer currently ignores empty tokens.
596 * The default for this property is true.
597 *
598 * @return true if empty tokens are not returned.
599 */
600 public boolean isIgnoreEmptyTokens() {
601 return ignoreEmptyTokens;
602 }
603
604 /**
605 * Checks if the characters at the index specified match the quote
606 * already matched in readNextToken().
607 *
608 * @param srcChars the character array being tokenized.
609 * @param pos the position to check for a quote.
610 * @param len the length of the character array being tokenized.
611 * @param quoteStart the start position of the matched quote, 0 if no quoting.
612 * @param quoteLen the length of the matched quote, 0 if no quoting.
613 * @return true if a quote is matched.
614 */
615 private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
616 for (int i = 0; i < quoteLen; i++) {
617 if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
618 return false;
619 }
620 }
621 return true;
622 }
623
624 /**
625 * Gets the next token.
626 *
627 * @return the next String token.
628 * @throws NoSuchElementException if there are no more elements.
629 */
630 @Override
631 public String next() {
632 if (hasNext()) {
633 return tokens[tokenPos++];
634 }
635 throw new NoSuchElementException();
636 }
637
638 /**
639 * Gets the index of the next token to return.
640 *
641 * @return the next token index.
642 */
643 @Override
644 public int nextIndex() {
645 return tokenPos;
646 }
647
648 /**
649 * Gets the next token from the String.
650 * Equivalent to {@link #next()} except it returns null rather than
651 * throwing {@link NoSuchElementException} when no tokens remain.
652 *
653 * @return the next sequential token, or null when no more tokens are found.
654 */
655 public String nextToken() {
656 if (hasNext()) {
657 return tokens[tokenPos++];
658 }
659 return null;
660 }
661
662 /**
663 * Gets the token previous to the last returned token.
664 *
665 * @return the previous token.
666 */
667 @Override
668 public String previous() {
669 if (hasPrevious()) {
670 return tokens[--tokenPos];
671 }
672 throw new NoSuchElementException();
673 }
674
675 /**
676 * Gets the index of the previous token.
677 *
678 * @return the previous token index.
679 */
680 @Override
681 public int previousIndex() {
682 return tokenPos - 1;
683 }
684
685 /**
686 * Gets the previous token from the String.
687 *
688 * @return the previous sequential token, or null when no more tokens are found.
689 */
690 public String previousToken() {
691 if (hasPrevious()) {
692 return tokens[--tokenPos];
693 }
694 return null;
695 }
696
697 /**
698 * Reads character by character through the String to get the next token.
699 *
700 * @param srcChars the character array being tokenized.
701 * @param start the first character of field.
702 * @param len the length of the character array being tokenized.
703 * @param workArea a temporary work area.
704 * @param tokenList the list of parsed tokens.
705 * @return the starting position of the next field (the character
706 * immediately after the delimiter), or -1 if end of string found.
707 */
708 private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
709 // skip all leading whitespace, unless it is the
710 // field delimiter or the quote character
711 while (start < len) {
712 final int removeLen = Math.max(
713 getIgnoredMatcher().isMatch(srcChars, start, start, len),
714 getTrimmerMatcher().isMatch(srcChars, start, start, len));
715 if (removeLen == 0 ||
716 getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
717 getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
718 break;
719 }
720 start += removeLen;
721 }
722
723 // handle reaching end
724 if (start >= len) {
725 addToken(tokenList, StringUtils.EMPTY);
726 return -1;
727 }
728
729 // handle empty token
730 final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
731 if (delimLen > 0) {
732 addToken(tokenList, StringUtils.EMPTY);
733 return start + delimLen;
734 }
735
736 // handle found token
737 final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
738 if (quoteLen > 0) {
739 return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
740 }
741 return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
742 }
743
744 /**
745 * Reads a possibly quoted string token.
746 *
747 * @param srcChars the character array being tokenized.
748 * @param start the first character of field.
749 * @param len the length of the character array being tokenized.
750 * @param workArea a temporary work area.
751 * @param tokenList the list of parsed tokens.
752 * @param quoteStart the start position of the matched quote, 0 if no quoting.
753 * @param quoteLen the length of the matched quote, 0 if no quoting.
754 * @return the starting position of the next field (the character
755 * immediately after the delimiter, or if end of string found,
756 * then the length of string.
757 */
758 private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
759 final List<String> tokenList, final int quoteStart, final int quoteLen) {
760 // Loop until we've found the end of the quoted
761 // string or the end of the input
762 workArea.clear();
763 int pos = start;
764 boolean quoting = quoteLen > 0;
765 int trimStart = 0;
766
767 while (pos < len) {
768 // quoting mode can occur several times throughout a string
769 // we must switch between quoting and non-quoting until we
770 // encounter a non-quoted delimiter, or end of string
771 if (quoting) {
772 // In quoting mode
773
774 // If we've found a quote character, see if it's
775 // followed by a second quote. If so, then we need
776 // to actually put the quote character into the token
777 // rather than end the token.
778 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
779 if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
780 // matched pair of quotes, thus an escaped quote
781 workArea.append(srcChars, pos, quoteLen);
782 pos += quoteLen * 2;
783 trimStart = workArea.size();
784 continue;
785 }
786
787 // end of quoting
788 quoting = false;
789 pos += quoteLen;
790 continue;
791 }
792
793 } else {
794 // Not in quoting mode
795
796 // check for delimiter, and thus end of token
797 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
798 if (delimLen > 0) {
799 // return condition when end of token found
800 addToken(tokenList, workArea.substring(0, trimStart));
801 return pos + delimLen;
802 }
803
804 // check for quote, and thus back into quoting mode
805 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
806 quoting = true;
807 pos += quoteLen;
808 continue;
809 }
810
811 // check for ignored (outside quotes), and ignore
812 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
813 if (ignoredLen > 0) {
814 pos += ignoredLen;
815 continue;
816 }
817
818 // check for trimmed character
819 // don't yet know if it's at the end, so copy to workArea
820 // use trimStart to keep track of trim at the end
821 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
822 if (trimmedLen > 0) {
823 workArea.append(srcChars, pos, trimmedLen);
824 pos += trimmedLen;
825 continue;
826 }
827 }
828 // copy regular character from inside quotes
829 workArea.append(srcChars[pos++]);
830 trimStart = workArea.size();
831 }
832
833 // return condition when end of string found
834 addToken(tokenList, workArea.substring(0, trimStart));
835 return -1;
836 }
837
838 /**
839 * Unsupported ListIterator operation.
840 *
841 * @throws UnsupportedOperationException always.
842 */
843 @Override
844 public void remove() {
845 throw new UnsupportedOperationException("remove() is unsupported");
846 }
847
848 /**
849 * Resets this tokenizer, forgetting all parsing and iteration already completed.
850 * <p>
851 * This method allows the same tokenizer to be reused for the same String.
852 * </p>
853 *
854 * @return {@code this} instance.
855 */
856 public StrTokenizer reset() {
857 tokenPos = 0;
858 tokens = null;
859 return this;
860 }
861
862 /**
863 * Reset this tokenizer, giving it a new input string to parse.
864 * In this manner you can re-use a tokenizer with the same settings
865 * on multiple input lines.
866 *
867 * @param input the new character array to tokenize, not cloned, null sets no text to parse.
868 * @return {@code this} instance.
869 */
870 public StrTokenizer reset(final char[] input) {
871 reset();
872 this.chars = ArrayUtils.clone(input);
873 return this;
874 }
875
876 /**
877 * Reset this tokenizer, giving it a new input string to parse.
878 * In this manner you can re-use a tokenizer with the same settings
879 * on multiple input lines.
880 *
881 * @param input the new string to tokenize, null sets no text to parse.
882 * @return {@code this} instance.
883 */
884 public StrTokenizer reset(final String input) {
885 reset();
886 if (input != null) {
887 this.chars = input.toCharArray();
888 } else {
889 this.chars = null;
890 }
891 return this;
892 }
893
894 /**
895 * Unsupported ListIterator operation.
896 *
897 * @param obj this parameter ignored.
898 * @throws UnsupportedOperationException always.
899 */
900 @Override
901 public void set(final String obj) {
902 throw new UnsupportedOperationException("set() is unsupported");
903 }
904
905 /**
906 * Sets the field delimiter character.
907 *
908 * @param delim the delimiter character to use.
909 * @return this, to enable chaining.
910 */
911 public StrTokenizer setDelimiterChar(final char delim) {
912 return setDelimiterMatcher(StrMatcher.charMatcher(delim));
913 }
914
915 /**
916 * Sets the field delimiter matcher.
917 * <p>
918 * The delimiter is used to separate one token from another.
919 * </p>
920 *
921 * @param delim the delimiter matcher to use.
922 * @return this, to enable chaining.
923 */
924 public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
925 if (delim == null) {
926 this.delimMatcher = StrMatcher.noneMatcher();
927 } else {
928 this.delimMatcher = delim;
929 }
930 return this;
931 }
932
933 /**
934 * Sets the field delimiter string.
935 *
936 * @param delim the delimiter string to use.
937 * @return this, to enable chaining.
938 */
939 public StrTokenizer setDelimiterString(final String delim) {
940 return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
941 }
942
943 /**
944 * Sets whether the tokenizer should return empty tokens as null.
945 * The default for this property is false.
946 *
947 * @param emptyAsNull whether empty tokens are returned as null.
948 * @return this, to enable chaining.
949 */
950 public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
951 this.emptyAsNull = emptyAsNull;
952 return this;
953 }
954
955 /**
956 * Sets the character to ignore.
957 * <p>
958 * This character is ignored when parsing the String, unless it is
959 * within a quoted region.
960 *
961 * @param ignored the ignored character to use.
962 * @return this, to enable chaining.
963 */
964 public StrTokenizer setIgnoredChar(final char ignored) {
965 return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
966 }
967
968 /**
969 * Sets the matcher for characters to ignore.
970 * <p>
971 * These characters are ignored when parsing the String, unless they are
972 * within a quoted region.
973 * </p>
974 *
975 * @param ignored the ignored matcher to use, null ignored.
976 * @return {@code this} instance.
977 */
978 public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
979 if (ignored != null) {
980 this.ignoredMatcher = ignored;
981 }
982 return this;
983 }
984
985 /**
986 * Sets whether the tokenizer should ignore and not return empty tokens.
987 * The default for this property is true.
988 *
989 * @param ignoreEmptyTokens whether empty tokens are not returned.
990 * @return {@code this} instance.
991 */
992 public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
993 this.ignoreEmptyTokens = ignoreEmptyTokens;
994 return this;
995 }
996
997 /**
998 * Sets the quote character to use.
999 * <p>
1000 * The quote character is used to wrap data between the tokens.
1001 * This enables delimiters to be entered as data.
1002 * </p>
1003 *
1004 * @param quote the quote character to use.
1005 * @return {@code this} instance.
1006 */
1007 public StrTokenizer setQuoteChar(final char quote) {
1008 return setQuoteMatcher(StrMatcher.charMatcher(quote));
1009 }
1010
1011 /**
1012 * Sets the quote matcher to use.
1013 * <p>
1014 * The quote character is used to wrap data between the tokens.
1015 * This enables delimiters to be entered as data.
1016 * </p>
1017 *
1018 * @param quote the quote matcher to use, null ignored.
1019 * @return {@code this} instance.
1020 */
1021 public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
1022 if (quote != null) {
1023 this.quoteMatcher = quote;
1024 }
1025 return this;
1026 }
1027
1028 /**
1029 * Sets the matcher for characters to trim.
1030 * <p>
1031 * These characters are trimmed off on each side of the delimiter
1032 * until the token or quote is found.
1033 * </p>
1034 *
1035 * @param trimmer the trimmer matcher to use, null ignored.
1036 * @return {@code this} instance.
1037 */
1038 public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1039 if (trimmer != null) {
1040 this.trimmerMatcher = trimmer;
1041 }
1042 return this;
1043 }
1044
1045 // API
1046 /**
1047 * Gets the number of tokens found in the String.
1048 *
1049 * @return the number of matched tokens.
1050 */
1051 public int size() {
1052 checkTokenized();
1053 return tokens.length;
1054 }
1055
1056 /**
1057 * Internal method to performs the tokenization.
1058 * <p>
1059 * Most users of this class do not need to call this method. This method
1060 * will be called automatically by other (public) methods when required.
1061 * </p>
1062 * <p>
1063 * This method exists to allow subclasses to add code before or after the
1064 * tokenization. For example, a subclass could alter the character array,
1065 * offset or count to be parsed, or call the tokenizer multiple times on
1066 * multiple strings. It is also be possible to filter the results.
1067 * </p>
1068 * <p>
1069 * {@link StrTokenizer} will always pass a zero offset and a count
1070 * equal to the length of the array to this method, however a subclass
1071 * may pass other values, or even an entirely different array.
1072 * </p>
1073 *
1074 * @param srcChars the character array being tokenized, may be null.
1075 * @param offset the start position within the character array, must be valid.
1076 * @param count the number of characters to tokenize, must be valid.
1077 * @return the modifiable list of String tokens, unmodifiable if null array or zero count.
1078 */
1079 protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1080 if (ArrayUtils.isEmpty(srcChars)) {
1081 return Collections.emptyList();
1082 }
1083 final StrBuilder buf = new StrBuilder();
1084 final List<String> tokenList = new ArrayList<>();
1085 int pos = offset;
1086
1087 // loop around the entire buffer
1088 while (pos >= 0 && pos < count) {
1089 // find next token
1090 pos = readNextToken(srcChars, pos, count, buf, tokenList);
1091
1092 // handle case where end of string is a delimiter
1093 if (pos >= count) {
1094 addToken(tokenList, StringUtils.EMPTY);
1095 }
1096 }
1097 return tokenList;
1098 }
1099
1100 /**
1101 * Gets the String content that the tokenizer is parsing.
1102 *
1103 * @return the string content being parsed.
1104 */
1105 @Override
1106 public String toString() {
1107 if (tokens == null) {
1108 return "StrTokenizer[not tokenized yet]";
1109 }
1110 return "StrTokenizer" + getTokenList();
1111 }
1112
1113 }