1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.text;
18
19 import java.util.ArrayList;
20 import java.util.Collections;
21 import java.util.List;
22 import java.util.ListIterator;
23 import java.util.NoSuchElementException;
24
25 import org.apache.commons.lang3.ArrayUtils;
26 import org.apache.commons.lang3.StringUtils;
27
28 /**
29 * Tokenizes a string based on delimiters (separators)
30 * and supporting quoting and ignored character concepts.
31 * <p>
32 * This class can split a String into many smaller strings. It aims
33 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
34 * however it offers much more control and flexibility including implementing
35 * the {@code ListIterator} interface. By default, it is set up
36 * like {@code StringTokenizer}.
37 * <p>
38 * The input String is split into a number of <em>tokens</em>.
39 * Each token is separated from the next String by a <em>delimiter</em>.
40 * One or more delimiter characters must be specified.
41 * <p>
42 * Each token may be surrounded by quotes.
43 * The <em>quote</em> matcher specifies the quote character(s).
44 * A quote may be escaped within a quoted section by duplicating itself.
45 * <p>
46 * Between each token and the delimiter are potentially characters that need trimming.
47 * The <em>trimmer</em> matcher specifies these characters.
48 * One usage might be to trim whitespace characters.
49 * <p>
50 * At any point outside the quotes there might potentially be invalid characters.
51 * The <em>ignored</em> matcher specifies these characters to be removed.
52 * One usage might be to remove new line characters.
53 * <p>
54 * Empty tokens may be removed or returned as null.
55 * <pre>
56 * "a,b,c" - Three tokens "a","b","c" (comma delimiter)
57 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace)
58 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
59 * </pre>
60 *
61 * <table>
62 * <caption>StrTokenizer properties and options</caption>
63 * <tr>
64 * <th>Property</th><th>Type</th><th>Default</th>
65 * </tr>
66 * <tr>
67 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
68 * </tr>
69 * <tr>
70 * <td>quote</td><td>NoneMatcher</td><td>{}</td>
71 * </tr>
72 * <tr>
73 * <td>ignore</td><td>NoneMatcher</td><td>{}</td>
74 * </tr>
75 * <tr>
76 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
77 * </tr>
78 * <tr>
79 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
80 * </tr>
81 * </table>
82 *
83 * @since 1.0
84 * @deprecated Deprecated as of 1.3, use {@link StringTokenizer} instead. This class will be removed in 2.0.
85 */
86 @Deprecated
87 public class StrTokenizer implements ListIterator<String>, Cloneable {
88
/**
 * Prototype tokenizer for Comma Separated Values. It is private and only ever cloned (see {@code getCSVClone()}),
 * so its configuration is never altered by callers: comma delimiter, double-quote quoting, nothing ignored,
 * the trim matcher as trimmer, and empty tokens kept as empty strings.
 */
// @formatter:off
private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
        .setDelimiterMatcher(StrMatcher.commaMatcher())
        .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
        .setIgnoredMatcher(StrMatcher.noneMatcher())
        .setTrimmerMatcher(StrMatcher.trimMatcher())
        .setEmptyTokenAsNull(false)
        .setIgnoreEmptyTokens(false);
// @formatter:on

/**
 * Prototype tokenizer for Tab Separated Values. It is private and only ever cloned (see {@code getTSVClone()}).
 * Same configuration as the CSV prototype except that the delimiter is the tab matcher.
 */
// @formatter:off
private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
        .setDelimiterMatcher(StrMatcher.tabMatcher())
        .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
        .setIgnoredMatcher(StrMatcher.noneMatcher())
        .setTrimmerMatcher(StrMatcher.trimMatcher())
        .setEmptyTokenAsNull(false)
        .setIgnoreEmptyTokens(false);
// @formatter:on
110
111 /**
112 * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
113 *
114 * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
115 */
116 private static StrTokenizer getCSVClone() {
117 return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
118 }
119
/**
 * Gets a new tokenizer instance which parses Comma Separated Value strings, with no input text set yet. The default for CSV processing is to
 * trim whitespace from both ends (which can be overridden with {@link #setTrimmerMatcher(StrMatcher)}).
 * <p>
 * You must call a "reset" method to set the string which you want to parse.
 * </p>
 *
 * @return a new tokenizer instance which parses Comma Separated Value strings.
 */
public static StrTokenizer getCSVInstance() {
    return getCSVClone();
}
132
133 /**
134 * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be
135 * trim whitespace from both ends (which can be overridden with the setTrimmer method).
136 *
137 * @param input the text to parse.
138 * @return a new tokenizer instance which parses Comma Separated Value strings.
139 */
140 public static StrTokenizer getCSVInstance(final char[] input) {
141 final StrTokenizer tok = getCSVClone();
142 tok.reset(input);
143 return tok;
144 }
145
146 /**
147 * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be
148 * trim whitespace from both ends (which can be overridden with the setTrimmer method).
149 *
150 * @param input the text to parse.
151 * @return a new tokenizer instance which parses Comma Separated Value strings.
152 */
153 public static StrTokenizer getCSVInstance(final String input) {
154 final StrTokenizer tok = getCSVClone();
155 tok.reset(input);
156 return tok;
157 }
158
159 /**
160 * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
161 *
162 * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
163 */
164 private static StrTokenizer getTSVClone() {
165 return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
166 }
167
/**
 * Gets a new tokenizer instance which parses Tab Separated Value strings, with no input text set yet. The default for TSV processing is to trim
 * whitespace from both ends (which can be overridden with {@link #setTrimmerMatcher(StrMatcher)}).
 * <p>
 * You must call a "reset" method to set the string which you want to parse.
 * </p>
 *
 * @return a new tokenizer instance which parses Tab Separated Value strings.
 */
public static StrTokenizer getTSVInstance() {
    return getTSVClone();
}
180
181 /**
182 * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can
183 * be overridden with the setTrimmer method).
184 *
185 * @param input the string to parse.
186 * @return a new tokenizer instance which parses Tab Separated Value strings.
187 */
188 public static StrTokenizer getTSVInstance(final char[] input) {
189 final StrTokenizer tok = getTSVClone();
190 tok.reset(input);
191 return tok;
192 }
193
194 /**
195 * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can
196 * be overridden with the setTrimmer method).
197 *
198 * @param input the string to parse.
199 * @return a new tokenizer instance which parses Tab Separated Value strings.
200 */
201 public static StrTokenizer getTSVInstance(final String input) {
202 final StrTokenizer tok = getTSVClone();
203 tok.reset(input);
204 return tok;
205 }
206
/** The text to work on; {@code null} when no text has been set. */
private char[] chars;

/** The parsed tokens; {@code null} until tokenization has run, and again after any {@code reset}. */
private String[] tokens;

/** The current iteration position: the index into {@code tokens} of the token that {@code next()} would return. */
private int tokenPos;

/** The delimiter matcher; defaults to {@code StrMatcher.splitMatcher()}. */
private StrMatcher delimMatcher = StrMatcher.splitMatcher();

/** The quote matcher; defaults to matching nothing, i.e. no quoting. */
private StrMatcher quoteMatcher = StrMatcher.noneMatcher();

/** The ignored matcher; defaults to matching nothing. */
private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();

/** The trimmer matcher; defaults to matching nothing. */
private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

/** Whether to return empty tokens as null; defaults to {@code false}. */
private boolean emptyAsNull;

/** Whether to ignore (drop) empty tokens; defaults to {@code true}. */
private boolean ignoreEmptyTokens = true;
233
/**
 * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer, but with no text to tokenize.
 * <p>
 * This constructor is normally used with {@link #reset(String)}.
 * </p>
 */
public StrTokenizer() {
    // no input yet; tokenizing in this state yields no tokens until a reset(...) supplies text
    this.chars = null;
}
243
/**
 * Constructs a tokenizer splitting on space, tab, newline and form feed
 * as per StringTokenizer.
 * <p>
 * The array is defensively copied, so later changes to {@code input} do not affect this tokenizer.
 * </p>
 *
 * @param input the characters to parse, copied; {@code null} means no text to parse.
 */
public StrTokenizer(final char[] input) {
    this.chars = input == null ? null : input.clone();
}
257
/**
 * Constructs a tokenizer splitting on the specified character.
 *
 * @param input the characters to parse; the single-argument constructor this delegates to takes a copy. {@code null} means no text.
 * @param delim the field delimiter character.
 */
public StrTokenizer(final char[] input, final char delim) {
    this(input);
    setDelimiterChar(delim);
}
268
/**
 * Constructs a tokenizer splitting on the specified delimiter character
 * and handling quotes using the specified quote character.
 *
 * @param input the characters to parse; a copy is taken by the constructor chain. {@code null} means no text.
 * @param delim the field delimiter character.
 * @param quote the field quoted string character.
 */
public StrTokenizer(final char[] input, final char delim, final char quote) {
    this(input, delim);
    setQuoteChar(quote);
}
281
/**
 * Constructs a tokenizer splitting on the specified string.
 *
 * @param input the characters to parse; the single-argument constructor this delegates to takes a copy. {@code null} means no text.
 * @param delim the field delimiter string.
 */
public StrTokenizer(final char[] input, final String delim) {
    this(input);
    setDelimiterString(delim);
}
292
/**
 * Constructs a tokenizer splitting using the specified delimiter matcher.
 *
 * @param input the characters to parse; the single-argument constructor this delegates to takes a copy. {@code null} means no text.
 * @param delim the field delimiter matcher; {@code null} is replaced with a matcher that matches nothing.
 */
public StrTokenizer(final char[] input, final StrMatcher delim) {
    this(input);
    setDelimiterMatcher(delim);
}
303
/**
 * Constructs a tokenizer splitting using the specified delimiter matcher
 * and handling quotes using the specified quote matcher.
 *
 * @param input the characters to parse; a copy is taken by the constructor chain. {@code null} means no text.
 * @param delim the field delimiter matcher; {@code null} is replaced with a matcher that matches nothing.
 * @param quote the field quoted string matcher; {@code null} leaves the default quote matcher in place.
 */
public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
    this(input, delim);
    setQuoteMatcher(quote);
}
316
/**
 * Constructs a tokenizer splitting on space, tab, newline and form feed
 * as per StringTokenizer.
 *
 * @param input the string to parse; {@code null} means no text to parse.
 */
public StrTokenizer(final String input) {
    chars = input == null ? null : input.toCharArray();
}
330
/**
 * Constructs a tokenizer splitting on the specified delimiter character.
 *
 * @param input the string which is to be parsed; {@code null} means no text.
 * @param delim the field delimiter character.
 */
public StrTokenizer(final String input, final char delim) {
    this(input);
    setDelimiterChar(delim);
}
341
/**
 * Constructs a tokenizer splitting on the specified delimiter character
 * and handling quotes using the specified quote character.
 *
 * @param input the string which is to be parsed; {@code null} means no text.
 * @param delim the field delimiter character.
 * @param quote the field quoted string character.
 */
public StrTokenizer(final String input, final char delim, final char quote) {
    this(input, delim);
    setQuoteChar(quote);
}
354
/**
 * Constructs a tokenizer splitting on the specified delimiter string.
 *
 * @param input the string which is to be parsed; {@code null} means no text.
 * @param delim the field delimiter string.
 */
public StrTokenizer(final String input, final String delim) {
    this(input);
    setDelimiterString(delim);
}
365
/**
 * Constructs a tokenizer splitting using the specified delimiter matcher.
 *
 * @param input the string which is to be parsed; {@code null} means no text.
 * @param delim the field delimiter matcher; {@code null} is replaced with a matcher that matches nothing.
 */
public StrTokenizer(final String input, final StrMatcher delim) {
    this(input);
    setDelimiterMatcher(delim);
}
376
/**
 * Constructs a tokenizer splitting using the specified delimiter matcher
 * and handling quotes using the specified quote matcher.
 *
 * @param input the string which is to be parsed; {@code null} means no text.
 * @param delim the field delimiter matcher; {@code null} is replaced with a matcher that matches nothing.
 * @param quote the field quoted string matcher; {@code null} leaves the default quote matcher in place.
 */
public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
    this(input, delim);
    setQuoteMatcher(quote);
}
389
/**
 * Unsupported ListIterator operation; the token list cannot be modified through this iterator.
 *
 * @param obj this parameter ignored.
 * @throws UnsupportedOperationException always.
 */
@Override
public void add(final String obj) {
    throw new UnsupportedOperationException("add() is unsupported");
}
400
401 /**
402 * Adds a token to a list, paying attention to the parameters we've set.
403 *
404 * @param list the list to add to.
405 * @param tok the token to add.
406 */
407 private void addToken(final List<String> list, String tok) {
408 if (tok == null || tok.isEmpty()) {
409 if (isIgnoreEmptyTokens()) {
410 return;
411 }
412 if (isEmptyTokenAsNull()) {
413 tok = null;
414 }
415 }
416 list.add(tok);
417 }
418
419 /**
420 * Checks if tokenization has been done, and if not then do it.
421 */
422 private void checkTokenized() {
423 if (tokens == null) {
424 if (chars == null) {
425 // still call tokenize as subclass may do some work
426 final List<String> split = tokenize(null, 0, 0);
427 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
428 } else {
429 final List<String> split = tokenize(chars, 0, chars.length);
430 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
431 }
432 }
433 }
434
435 /**
436 * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token list. If a
437 * {@link CloneNotSupportedException} is caught, return {@code null}.
438 *
439 * @return a new instance of this Tokenizer which has been reset.
440 */
441 @Override
442 public Object clone() {
443 try {
444 return cloneReset();
445 } catch (final CloneNotSupportedException ex) {
446 return null;
447 }
448 }
449
450 /**
451 * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token list.
452 *
453 * @return a new instance of this Tokenizer which has been reset.
454 * @throws CloneNotSupportedException if there is a problem cloning.
455 */
456 Object cloneReset() throws CloneNotSupportedException {
457 // this method exists to enable 100% test coverage
458 final StrTokenizer cloned = (StrTokenizer) super.clone();
459 if (cloned.chars != null) {
460 cloned.chars = cloned.chars.clone();
461 }
462 cloned.reset();
463 return cloned;
464 }
465
466 /**
467 * Gets the String content that the tokenizer is parsing.
468 *
469 * @return The string content being parsed.
470 */
471 public String getContent() {
472 if (chars == null) {
473 return null;
474 }
475 return new String(chars);
476 }
477
/**
 * Gets the field delimiter matcher.
 * <p>
 * The delimiter is used to separate one token from another.
 * </p>
 *
 * @return The delimiter matcher in use.
 */
public StrMatcher getDelimiterMatcher() {
    return this.delimMatcher;
}
486
/**
 * Gets the ignored character matcher.
 * <p>
 * These characters are ignored when parsing the String, unless they are within a quoted region. The default value is not to ignore anything.
 * </p>
 *
 * @return The ignored matcher in use.
 */
public StrMatcher getIgnoredMatcher() {
    return ignoredMatcher;
}
498
/**
 * Gets the quote matcher currently in use.
 * <p>
 * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The default value is a matcher
 * that matches nothing, i.e. quoting is disabled unless a quote matcher is set.
 * </p>
 *
 * @return The quote matcher in use.
 */
public StrMatcher getQuoteMatcher() {
    return quoteMatcher;
}
510
/**
 * Gets a copy of the full token list as an independent modifiable array.
 * <p>
 * Triggers tokenization if it has not happened yet. The iteration position is not affected.
 * </p>
 *
 * @return The tokens as a String array.
 */
public String[] getTokenArray() {
    checkTokenized();
    // clone so callers cannot modify the cached tokens
    return tokens.clone();
}
520
/**
 * Gets a copy of the full token list as an independent modifiable list.
 * <p>
 * Triggers tokenization if it has not happened yet. The iteration position is not affected.
 * </p>
 *
 * @return The tokens as a new String list.
 */
public List<String> getTokenList() {
    checkTokenized();
    final List<String> list = new ArrayList<>(tokens.length);
    Collections.addAll(list, tokens);

    return list;
}
533
/**
 * Gets the trimmer character matcher.
 * <p>
 * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default value is not to trim anything.
 * </p>
 *
 * @return The trimmer matcher in use.
 */
public StrMatcher getTrimmerMatcher() {
    return trimmerMatcher;
}
545
/**
 * Checks whether there are any more tokens. Triggers tokenization if it has not happened yet.
 *
 * @return true if there are more tokens.
 */
@Override
public boolean hasNext() {
    checkTokenized();
    return tokenPos < tokens.length;
}
556
/**
 * Checks whether there are any previous tokens that can be iterated to. Triggers tokenization if it has not happened yet.
 *
 * @return true if there are previous tokens.
 */
@Override
public boolean hasPrevious() {
    checkTokenized();
    return tokenPos > 0;
}
567
/**
 * Gets whether the tokenizer currently returns empty tokens as null.
 * The default for this property is false.
 *
 * @return true if empty tokens are returned as null.
 */
public boolean isEmptyTokenAsNull() {
    return this.emptyAsNull;
}
577
/**
 * Gets whether the tokenizer currently ignores empty tokens.
 * The default for this property is true.
 *
 * @return true if empty tokens are not returned.
 */
public boolean isIgnoreEmptyTokens() {
    return ignoreEmptyTokens;
}
587
588 /**
589 * Checks if the characters at the index specified match the quote
590 * already matched in readNextToken().
591 *
592 * @param srcChars the character array being tokenized.
593 * @param pos the position to check for a quote.
594 * @param len the length of the character array being tokenized.
595 * @param quoteStart the start position of the matched quote, 0 if no quoting.
596 * @param quoteLen the length of the matched quote, 0 if no quoting.
597 * @return true if a quote is matched.
598 */
599 private boolean isQuote(final char[] srcChars,
600 final int pos,
601 final int len,
602 final int quoteStart,
603 final int quoteLen) {
604 for (int i = 0; i < quoteLen; i++) {
605 if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
606 return false;
607 }
608 }
609 return true;
610 }
611
612 /**
613 * Gets the next token.
614 *
615 * @return The next String token.
616 * @throws NoSuchElementException if there are no more elements.
617 */
618 @Override
619 public String next() {
620 if (hasNext()) {
621 return tokens[tokenPos++];
622 }
623 throw new NoSuchElementException();
624 }
625
/**
 * Gets the index of the next token to return. This reads the current position only and does not trigger tokenization.
 *
 * @return The next token index.
 */
@Override
public int nextIndex() {
    return tokenPos;
}
635
636 /**
637 * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing {@link NoSuchElementException} when no
638 * tokens remain.
639 *
640 * @return The next sequential token, or null when no more tokens are found.
641 */
642 public String nextToken() {
643 if (hasNext()) {
644 return tokens[tokenPos++];
645 }
646 return null;
647 }
648
649 /**
650 * Gets the token previous to the last returned token.
651 *
652 * @return The previous token.
653 */
654 @Override
655 public String previous() {
656 if (hasPrevious()) {
657 return tokens[--tokenPos];
658 }
659 throw new NoSuchElementException();
660 }
661
/**
 * Gets the index of the previous token. This is -1 when the iteration is at the start; the call does not trigger tokenization.
 *
 * @return The previous token index.
 */
@Override
public int previousIndex() {
    return tokenPos - 1;
}
671
672 /**
673 * Gets the previous token from the String.
674 *
675 * @return The previous sequential token, or null when no more tokens are found.
676 */
677 public String previousToken() {
678 if (hasPrevious()) {
679 return tokens[--tokenPos];
680 }
681 return null;
682 }
683
684 /**
685 * Reads character by character through the String to get the next token.
686 *
687 * @param srcChars the character array being tokenized.
688 * @param start the first character of field.
689 * @param len the length of the character array being tokenized.
690 * @param workArea a temporary work area.
691 * @param tokenList the list of parsed tokens.
692 * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of string found.
693 */
694 private int readNextToken(final char[] srcChars,
695 int start,
696 final int len,
697 final StrBuilder workArea,
698 final List<String> tokenList) {
699 // skip all leading whitespace, unless it is the
700 // field delimiter or the quote character
701 while (start < len) {
702 final int removeLen = Math.max(
703 getIgnoredMatcher().isMatch(srcChars, start, start, len),
704 getTrimmerMatcher().isMatch(srcChars, start, start, len));
705 if (removeLen == 0
706 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
707 || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
708 break;
709 }
710 start += removeLen;
711 }
712
713 // handle reaching end
714 if (start >= len) {
715 addToken(tokenList, StringUtils.EMPTY);
716 return -1;
717 }
718
719 // handle empty token
720 final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
721 if (delimLen > 0) {
722 addToken(tokenList, StringUtils.EMPTY);
723 return start + delimLen;
724 }
725
726 // handle found token
727 final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
728 if (quoteLen > 0) {
729 return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
730 }
731 return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
732 }
733
/**
 * Reads a possibly quoted string token, adding it to {@code tokenList} when its end (a delimiter outside quotes,
 * or the end of the input) is reached.
 *
 * @param srcChars the character array being tokenized.
 * @param start the first character of field.
 * @param len the length of the character array being tokenized.
 * @param workArea a temporary work area; cleared on entry and used to build the token.
 * @param tokenList the list of parsed tokens.
 * @param quoteStart the start position of the matched quote, 0 if no quoting.
 * @param quoteLen the length of the matched quote, 0 if no quoting.
 * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of string found.
 */
private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
        final List<String> tokenList, final int quoteStart, final int quoteLen) {
    // Loop until we've found the end of the quoted
    // string or the end of the input
    workArea.clear();
    int pos = start;
    boolean quoting = quoteLen > 0;
    // size of workArea up to the last character that must be kept; trimmer characters
    // appended after that point are cut off again by substring(0, trimStart)
    int trimStart = 0;

    while (pos < len) {
        // quoting mode can occur several times throughout a string
        // we must switch between quoting and non-quoting until we
        // encounter a non-quoted delimiter, or end of string
        if (quoting) {
            // In quoting mode

            // If we've found a quote character, see if it's
            // followed by a second quote. If so, then we need
            // to actually put the quote character into the token
            // rather than end the token.
            if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                    // matched pair of quotes, thus an escaped quote
                    workArea.append(srcChars, pos, quoteLen);
                    pos += quoteLen * 2;
                    trimStart = workArea.size();
                    continue;
                }

                // end of quoting
                quoting = false;
                pos += quoteLen;
                continue;
            }

        } else {
            // Not in quoting mode

            // check for delimiter, and thus end of token
            final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
            if (delimLen > 0) {
                // return condition when end of token found
                addToken(tokenList, workArea.substring(0, trimStart));
                return pos + delimLen;
            }

            // check for quote, and thus back into quoting mode
            if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                quoting = true;
                pos += quoteLen;
                continue;
            }

            // check for ignored (outside quotes), and ignore
            final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
            if (ignoredLen > 0) {
                pos += ignoredLen;
                continue;
            }

            // check for trimmed character
            // don't yet know if its at the end, so copy to workArea
            // use trimStart to keep track of trim at the end
            final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
            if (trimmedLen > 0) {
                workArea.append(srcChars, pos, trimmedLen);
                pos += trimmedLen;
                continue;
            }

        }
        // copy a regular character: either content inside quotes, or an unquoted
        // character that is not a delimiter, quote, ignored or trimmed character
        workArea.append(srcChars[pos++]);
        trimStart = workArea.size();
    }

    // return condition when end of string found
    addToken(tokenList, workArea.substring(0, trimStart));
    return -1;
}
826
/**
 * Unsupported ListIterator operation; the token list cannot be modified through this iterator.
 *
 * @throws UnsupportedOperationException always.
 */
@Override
public void remove() {
    throw new UnsupportedOperationException("remove() is unsupported");
}
836
/**
 * Resets this tokenizer, forgetting all parsing and iteration already completed.
 * <p>
 * This method allows the same tokenizer to be reused for the same String. The text and the matcher settings are kept;
 * the text is tokenized again the next time tokens are requested.
 * </p>
 *
 * @return {@code this} instance.
 */
public StrTokenizer reset() {
    tokenPos = 0;
    // null marks "not tokenized yet" for checkTokenized()
    tokens = null;
    return this;
}
850
851 /**
852 * Reset this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the same settings on multiple input lines.
853 *
854 * @param input the new character array to tokenize, not cloned, null sets no text to parse.
855 * @return {@code this} instance.
856 */
857 public StrTokenizer reset(final char[] input) {
858 reset();
859 if (input != null) {
860 this.chars = input.clone();
861 } else {
862 this.chars = null;
863 }
864 return this;
865 }
866
867 /**
868 * Reset this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the same settings on multiple input lines.
869 *
870 * @param input the new string to tokenize, null sets no text to parse.
871 * @return {@code this} instance.
872 */
873 public StrTokenizer reset(final String input) {
874 reset();
875 if (input != null) {
876 this.chars = input.toCharArray();
877 } else {
878 this.chars = null;
879 }
880 return this;
881 }
882
/**
 * Unsupported ListIterator operation; the token list cannot be modified through this iterator.
 *
 * @param obj this parameter ignored.
 * @throws UnsupportedOperationException Always thrown.
 */
@Override
public void set(final String obj) {
    throw new UnsupportedOperationException("set() is unsupported");
}
893
/**
 * Sets the field delimiter character. Convenience for {@link #setDelimiterMatcher(StrMatcher)} with a single-character matcher.
 *
 * @param delim the delimiter character to use.
 * @return {@code this} instance.
 */
public StrTokenizer setDelimiterChar(final char delim) {
    return setDelimiterMatcher(StrMatcher.charMatcher(delim));
}
903
904 /**
905 * Sets the field delimiter matcher.
906 * <p>
907 * The delimiter is used to separate one token from another.
908 * </p>
909 *
910 * @param delim the delimiter matcher to use.
911 * @return {@code this} instance.
912 */
913 public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
914 if (delim == null) {
915 this.delimMatcher = StrMatcher.noneMatcher();
916 } else {
917 this.delimMatcher = delim;
918 }
919 return this;
920 }
921
/**
 * Sets the field delimiter string. Convenience for {@link #setDelimiterMatcher(StrMatcher)} with a string matcher.
 *
 * @param delim the delimiter string to use.
 * @return {@code this} instance.
 */
public StrTokenizer setDelimiterString(final String delim) {
    return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
}
931
/**
 * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
 * <p>
 * The setting is only consulted while tokenizing and this method does not reset the tokenizer, so tokens that
 * were already parsed are unaffected until a {@code reset} is performed.
 * </p>
 *
 * @param emptyAsNull whether empty tokens are returned as null.
 * @return {@code this} instance.
 */
public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
    this.emptyAsNull = emptyAsNull;
    return this;
}
942
/**
 * Sets the character to ignore. Convenience for {@link #setIgnoredMatcher(StrMatcher)} with a single-character matcher.
 * <p>
 * This character is ignored when parsing the String, unless it is within a quoted region.
 * </p>
 *
 * @param ignored the ignored character to use.
 * @return {@code this} instance.
 */
public StrTokenizer setIgnoredChar(final char ignored) {
    return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
}
955
956 /**
957 * Sets the matcher for characters to ignore.
958 * <p>
959 * These characters are ignored when parsing the String, unless they are within a quoted region.
960 * </p>
961 *
962 * @param ignored the ignored matcher to use, null ignored.
963 * @return {@code this} instance.
964 */
965 public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
966 if (ignored != null) {
967 this.ignoredMatcher = ignored;
968 }
969 return this;
970 }
971
/**
 * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
 * <p>
 * The setting is only consulted while tokenizing and this method does not reset the tokenizer, so tokens that
 * were already parsed are unaffected until a {@code reset} is performed.
 * </p>
 *
 * @param ignoreEmptyTokens whether empty tokens are not returned.
 * @return {@code this} instance.
 */
public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
    this.ignoreEmptyTokens = ignoreEmptyTokens;
    return this;
}
982
/**
 * Sets the quote character to use. Convenience for {@link #setQuoteMatcher(StrMatcher)} with a single-character matcher.
 * <p>
 * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
 * </p>
 *
 * @param quote the quote character to use.
 * @return {@code this} instance.
 */
public StrTokenizer setQuoteChar(final char quote) {
    return setQuoteMatcher(StrMatcher.charMatcher(quote));
}
995
996 /**
997 * Sets the quote matcher to use.
998 * <p>
999 * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
1000 * </p>
1001 *
1002 * @param quote the quote matcher to use, null ignored.
1003 * @return {@code this} instance.
1004 */
1005 public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
1006 if (quote != null) {
1007 this.quoteMatcher = quote;
1008 }
1009 return this;
1010 }
1011
1012 /**
1013 * Sets the matcher for characters to trim.
1014 * <p>
1015 * These characters are trimmed off on each side of the delimiter until the token or quote is found.
1016 * </p>
1017 *
1018 * @param trimmer the trimmer matcher to use, null ignored
1019 * @return {@code this} instance.
1020 */
1021 public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1022 if (trimmer != null) {
1023 this.trimmerMatcher = trimmer;
1024 }
1025 return this;
1026 }
1027
/**
 * Gets the number of tokens found in the String. Triggers tokenization if it has not happened yet.
 *
 * @return The number of matched tokens.
 */
public int size() {
    checkTokenized();
    return tokens.length;
}
1037
1038 /**
1039 * Internal method to performs the tokenization.
1040 * <p>
1041 * Most users of this class do not need to call this method. This method will be called automatically by other (public) methods when required.
1042 * </p>
1043 * <p>
1044 * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass could alter the character array, offset or
1045 * count to be parsed, or call the tokenizer multiple times on multiple strings. It is also be possible to filter the results.
1046 * </p>
1047 * <p>
1048 * {@code StrTokenizer} will always pass a zero offset and a count equal to the length of the array to this method, however a subclass may pass other
1049 * values, or even an entirely different array.
1050 * </p>
1051 *
1052 * @param srcChars the character array being tokenized, may be null.
1053 * @param offset the start position within the character array, must be valid.
1054 * @param count the number of characters to tokenize, must be valid.
1055 * @return The modifiable list of String tokens, unmodifiable if null array or zero count.
1056 */
1057 protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1058 if (srcChars == null || count == 0) {
1059 return Collections.emptyList();
1060 }
1061 final StrBuilder buf = new StrBuilder();
1062 final List<String> tokenList = new ArrayList<>();
1063 int pos = offset;
1064
1065 // loop around the entire buffer
1066 while (pos >= 0 && pos < count) {
1067 // find next token
1068 pos = readNextToken(srcChars, pos, count, buf, tokenList);
1069
1070 // handle case where end of string is a delimiter
1071 if (pos >= count) {
1072 addToken(tokenList, StringUtils.EMPTY);
1073 }
1074 }
1075 return tokenList;
1076 }
1077
1078 /**
1079 * Gets the String content that the tokenizer is parsing.
1080 *
1081 * @return The string content being parsed.
1082 */
1083 @Override
1084 public String toString() {
1085 if (tokens == null) {
1086 return "StrTokenizer[not tokenized yet]";
1087 }
1088 return "StrTokenizer" + getTokenList();
1089 }
1090
1091 }