View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text;
18  
19  import java.util.ArrayList;
20  import java.util.Collections;
21  import java.util.List;
22  import java.util.ListIterator;
23  import java.util.NoSuchElementException;
24  
25  import org.apache.commons.lang3.ArrayUtils;
26  import org.apache.commons.lang3.StringUtils;
27  
28  /**
29   * Tokenizes a string based on delimiters (separators)
30   * and supporting quoting and ignored character concepts.
31   * <p>
32   * This class can split a String into many smaller strings. It aims
33   * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
34   * however it offers much more control and flexibility including implementing
35   * the {@code ListIterator} interface. By default, it is set up
36   * like {@code StringTokenizer}.
37   * <p>
38   * The input String is split into a number of <em>tokens</em>.
39   * Each token is separated from the next String by a <em>delimiter</em>.
40   * One or more delimiter characters must be specified.
41   * <p>
42   * Each token may be surrounded by quotes.
43   * The <em>quote</em> matcher specifies the quote character(s).
44   * A quote may be escaped within a quoted section by duplicating itself.
45   * <p>
46   * Between each token and the delimiter are potentially characters that need trimming.
47   * The <em>trimmer</em> matcher specifies these characters.
48   * One usage might be to trim whitespace characters.
49   * <p>
50   * At any point outside the quotes there might potentially be invalid characters.
51   * The <em>ignored</em> matcher specifies these characters to be removed.
52   * One usage might be to remove new line characters.
53   * <p>
54   * Empty tokens may be removed or returned as null.
55   * <pre>
56   * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
57   * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
58   * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
59   * </pre>
60   *
61   * <table>
62   *  <caption>StrTokenizer properties and options</caption>
63   *  <tr>
64   *   <th>Property</th><th>Type</th><th>Default</th>
65   *  </tr>
66   *  <tr>
67   *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
68   *  </tr>
69   *  <tr>
70   *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
71   *  </tr>
72   *  <tr>
73   *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
74   *  </tr>
75   *  <tr>
76   *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
77   *  </tr>
78   *  <tr>
79   *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
80   *  </tr>
81   * </table>
82   *
83   * @since 1.0
84   * @deprecated Deprecated as of 1.3, use {@link StringTokenizer} instead. This class will be removed in 2.0.
85   */
86  @Deprecated
87  public class StrTokenizer implements ListIterator<String>, Cloneable {
88  
89      /** Prototype tokenizer for comma separated values: comma delimiter, double-quote quoting, trimming enabled, empty tokens kept as empty strings; cloned by getCSVClone(). */
90      // @formatter:off
91      private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
92              .setDelimiterMatcher(StrMatcher.commaMatcher())
93              .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
94              .setIgnoredMatcher(StrMatcher.noneMatcher())
95              .setTrimmerMatcher(StrMatcher.trimMatcher())
96              .setEmptyTokenAsNull(false)
97              .setIgnoreEmptyTokens(false);
98      // @formatter:on
99  
100     /** Prototype tokenizer for tab separated values: tab delimiter, double-quote quoting, trimming enabled, empty tokens kept as empty strings; cloned by getTSVClone(). */
101     // @formatter:off
102     private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
103             .setDelimiterMatcher(StrMatcher.tabMatcher())
104             .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
105             .setIgnoredMatcher(StrMatcher.noneMatcher())
106             .setTrimmerMatcher(StrMatcher.trimMatcher())
107             .setEmptyTokenAsNull(false)
108             .setIgnoreEmptyTokens(false);
109     // @formatter:on
110 
111     /**
112      * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
113      *
114      * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
115      */
116     private static StrTokenizer getCSVClone() {
117         return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
118     }
119 
120     /**
121      * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be
122      * trim whitespace from both ends (which can be overridden with the setTrimmer method).
123      * <p>
124      * You must call a "reset" method to set the string which you want to parse.
125      * </p>
126      *
127      * @return a new tokenizer instance which parses Comma Separated Value strings.
128      */
129     public static StrTokenizer getCSVInstance() {
130         return getCSVClone();
131     }
132 
133     /**
134      * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be
135      * trim whitespace from both ends (which can be overridden with the setTrimmer method).
136      *
137      * @param input the text to parse.
138      * @return a new tokenizer instance which parses Comma Separated Value strings.
139      */
140     public static StrTokenizer getCSVInstance(final char[] input) {
141         final StrTokenizer tok = getCSVClone();
142         tok.reset(input);
143         return tok;
144     }
145 
146     /**
147      * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be
148      * trim whitespace from both ends (which can be overridden with the setTrimmer method).
149      *
150      * @param input the text to parse.
151      * @return a new tokenizer instance which parses Comma Separated Value strings.
152      */
153     public static StrTokenizer getCSVInstance(final String input) {
154         final StrTokenizer tok = getCSVClone();
155         tok.reset(input);
156         return tok;
157     }
158 
159     /**
160      * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
161      *
162      * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
163      */
164     private static StrTokenizer getTSVClone() {
165         return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
166     }
167 
168     /**
169      * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can
170      * be overridden with the setTrimmer method).
171      * <p>
172      * You must call a "reset" method to set the string which you want to parse.
173      * </p>
174      *
175      * @return a new tokenizer instance which parses Tab Separated Value strings.
176      */
177     public static StrTokenizer getTSVInstance() {
178         return getTSVClone();
179     }
180 
181     /**
182      * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can
183      * be overridden with the setTrimmer method).
184      *
185      * @param input the string to parse.
186      * @return a new tokenizer instance which parses Tab Separated Value strings.
187      */
188     public static StrTokenizer getTSVInstance(final char[] input) {
189         final StrTokenizer tok = getTSVClone();
190         tok.reset(input);
191         return tok;
192     }
193 
194     /**
195      * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can
196      * be overridden with the setTrimmer method).
197      *
198      * @param input the string to parse.
199      * @return a new tokenizer instance which parses Tab Separated Value strings.
200      */
201     public static StrTokenizer getTSVInstance(final String input) {
202         final StrTokenizer tok = getTSVClone();
203         tok.reset(input);
204         return tok;
205     }
206 
207     /** The characters to tokenize; null when no input text has been set. */
208     private char[] chars;
209 
210     /** The parsed tokens; null until tokenization has run, and again after reset(). */
211     private String[] tokens;
212 
213     /** The current iteration position: index of the token that next() returns. */
214     private int tokenPos;
215 
216     /** The delimiter matcher; defaults to StrMatcher.splitMatcher(). */
217     private StrMatcher delimMatcher = StrMatcher.splitMatcher();
218 
219     /** The quote matcher; defaults to StrMatcher.noneMatcher(), i.e. no quoting. */
220     private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
221 
222     /** The ignored matcher; defaults to StrMatcher.noneMatcher(), i.e. nothing is ignored. */
223     private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
224 
225     /** The trimmer matcher; defaults to StrMatcher.noneMatcher(), i.e. nothing is trimmed. */
226     private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
227 
228     /** Whether to return empty tokens as null; defaults to false. */
229     private boolean emptyAsNull;
230 
231     /** Whether to ignore (drop) empty tokens; defaults to true. */
232     private boolean ignoreEmptyTokens = true;
233 
/**
 * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer,
 * with no text to tokenize yet.
 * <p>
 * This constructor is normally used with {@link #reset(String)}.
 * </p>
 */
public StrTokenizer() {
    // no input yet; a reset(...) call supplies it later
    chars = null;
}
243 
/**
 * Constructs a tokenizer splitting on space, tab, newline and form feed
 * as per StringTokenizer.
 *
 * @param input  the characters to parse; the array is cloned, {@code null} means no text.
 */
public StrTokenizer(final char[] input) {
    this.chars = input == null ? null : input.clone();
}
257 
/**
 * Constructs a tokenizer over the given characters that splits on a single delimiter character.
 *
 * @param input  the characters to parse; the array is cloned, {@code null} means no text.
 * @param delim  the field delimiter character.
 */
public StrTokenizer(final char[] input, final char delim) {
    this(input);
    this.setDelimiterChar(delim);
}
268 
/**
 * Constructs a tokenizer over the given characters that splits on a single delimiter character
 * and treats the given quote character as the quoting marker.
 *
 * @param input  the characters to parse; the array is cloned, {@code null} means no text.
 * @param delim  the field delimiter character.
 * @param quote  the field quoted string character.
 */
public StrTokenizer(final char[] input, final char delim, final char quote) {
    this(input, delim);
    this.setQuoteChar(quote);
}
281 
/**
 * Constructs a tokenizer over the given characters that splits on the specified delimiter string.
 *
 * @param input  the characters to parse; the array is cloned, {@code null} means no text.
 * @param delim  the field delimiter string.
 */
public StrTokenizer(final char[] input, final String delim) {
    this(input);
    this.setDelimiterString(delim);
}
292 
/**
 * Constructs a tokenizer over the given characters that splits using the specified delimiter matcher.
 *
 * @param input  the characters to parse; the array is cloned, {@code null} means no text.
 * @param delim  the field delimiter matcher.
 */
public StrTokenizer(final char[] input, final StrMatcher delim) {
    this(input);
    this.setDelimiterMatcher(delim);
}
303 
/**
 * Constructs a tokenizer over the given characters that splits using the specified delimiter matcher
 * and handles quotes using the specified quote matcher.
 *
 * @param input  the characters to parse; the array is cloned, {@code null} means no text.
 * @param delim  the field delimiter matcher.
 * @param quote  the field quoted string matcher.
 */
public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
    this(input, delim);
    this.setQuoteMatcher(quote);
}
316 
/**
 * Constructs a tokenizer splitting on space, tab, newline and form feed
 * as per StringTokenizer.
 *
 * @param input  the string which is to be parsed, {@code null} means no text.
 */
public StrTokenizer(final String input) {
    chars = input == null ? null : input.toCharArray();
}
330 
/**
 * Constructs a tokenizer over the given string that splits on a single delimiter character.
 *
 * @param input  the string which is to be parsed, {@code null} means no text.
 * @param delim  the field delimiter character.
 */
public StrTokenizer(final String input, final char delim) {
    this(input);
    this.setDelimiterChar(delim);
}
341 
/**
 * Constructs a tokenizer over the given string that splits on a single delimiter character
 * and treats the given quote character as the quoting marker.
 *
 * @param input  the string which is to be parsed, {@code null} means no text.
 * @param delim  the field delimiter character.
 * @param quote  the field quoted string character.
 */
public StrTokenizer(final String input, final char delim, final char quote) {
    this(input, delim);
    this.setQuoteChar(quote);
}
354 
/**
 * Constructs a tokenizer over the given string that splits on the specified delimiter string.
 *
 * @param input  the string which is to be parsed, {@code null} means no text.
 * @param delim  the field delimiter string.
 */
public StrTokenizer(final String input, final String delim) {
    this(input);
    this.setDelimiterString(delim);
}
365 
/**
 * Constructs a tokenizer over the given string that splits using the specified delimiter matcher.
 *
 * @param input  the string which is to be parsed, {@code null} means no text.
 * @param delim  the field delimiter matcher.
 */
public StrTokenizer(final String input, final StrMatcher delim) {
    this(input);
    this.setDelimiterMatcher(delim);
}
376 
/**
 * Constructs a tokenizer over the given string that splits using the specified delimiter matcher
 * and handles quotes using the specified quote matcher.
 *
 * @param input  the string which is to be parsed, {@code null} means no text.
 * @param delim  the field delimiter matcher.
 * @param quote  the field quoted string matcher.
 */
public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
    this(input, delim);
    this.setQuoteMatcher(quote);
}
389 
390     /**
391      * Unsupported ListIterator operation.
392      *
393      * @param obj this parameter ignored.
394      * @throws UnsupportedOperationException always.
395      */
396     @Override
397     public void add(final String obj) {
398         throw new UnsupportedOperationException("add() is unsupported");
399     }
400 
401     /**
402      * Adds a token to a list, paying attention to the parameters we've set.
403      *
404      * @param list  the list to add to.
405      * @param tok  the token to add.
406      */
407     private void addToken(final List<String> list, String tok) {
408         if (tok == null || tok.isEmpty()) {
409             if (isIgnoreEmptyTokens()) {
410                 return;
411             }
412             if (isEmptyTokenAsNull()) {
413                 tok = null;
414             }
415         }
416         list.add(tok);
417     }
418 
419     /**
420      * Checks if tokenization has been done, and if not then do it.
421      */
422     private void checkTokenized() {
423         if (tokens == null) {
424             if (chars == null) {
425                 // still call tokenize as subclass may do some work
426                 final List<String> split = tokenize(null, 0, 0);
427                 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
428             } else {
429                 final List<String> split = tokenize(chars, 0, chars.length);
430                 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
431             }
432         }
433     }
434 
435     /**
436      * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token list. If a
437      * {@link CloneNotSupportedException} is caught, return {@code null}.
438      *
439      * @return a new instance of this Tokenizer which has been reset.
440      */
441     @Override
442     public Object clone() {
443         try {
444             return cloneReset();
445         } catch (final CloneNotSupportedException ex) {
446             return null;
447         }
448     }
449 
450     /**
451      * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token list.
452      *
453      * @return a new instance of this Tokenizer which has been reset.
454      * @throws CloneNotSupportedException if there is a problem cloning.
455      */
456     Object cloneReset() throws CloneNotSupportedException {
457         // this method exists to enable 100% test coverage
458         final StrTokenizer cloned = (StrTokenizer) super.clone();
459         if (cloned.chars != null) {
460             cloned.chars = cloned.chars.clone();
461         }
462         cloned.reset();
463         return cloned;
464     }
465 
466     /**
467      * Gets the String content that the tokenizer is parsing.
468      *
469      * @return The string content being parsed.
470      */
471     public String getContent() {
472         if (chars == null) {
473             return null;
474         }
475         return new String(chars);
476     }
477 
478     /**
479      * Gets the field delimiter matcher.
480      *
481      * @return The delimiter matcher in use.
482      */
483     public StrMatcher getDelimiterMatcher() {
484         return this.delimMatcher;
485     }
486 
487     /**
488      * Gets the ignored character matcher.
489      * <p>
490      * These characters are ignored when parsing the String, unless they are within a quoted region. The default value is not to ignore anything.
491      * </p>
492      *
493      * @return The ignored matcher in use.
494      */
495     public StrMatcher getIgnoredMatcher() {
496         return ignoredMatcher;
497     }
498 
499     /**
500      * Gets the quote matcher currently in use.
501      * <p>
502      * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The default value is '"' (double quote).
503      * </p>
504      *
505      * @return The quote matcher in use.
506      */
507     public StrMatcher getQuoteMatcher() {
508         return quoteMatcher;
509     }
510 
511     /**
512      * Gets a copy of the full token list as an independent modifiable array.
513      *
514      * @return The tokens as a String array.
515      */
516     public String[] getTokenArray() {
517         checkTokenized();
518         return tokens.clone();
519     }
520 
521     /**
522      * Gets a copy of the full token list as an independent modifiable list.
523      *
524      * @return The tokens as a String array.
525      */
526     public List<String> getTokenList() {
527         checkTokenized();
528         final List<String> list = new ArrayList<>(tokens.length);
529         Collections.addAll(list, tokens);
530 
531         return list;
532     }
533 
534     /**
535      * Gets the trimmer character matcher.
536      * <p>
537      * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default value is not to trim anything.
538      * </p>
539      *
540      * @return The trimmer matcher in use.
541      */
542     public StrMatcher getTrimmerMatcher() {
543         return trimmerMatcher;
544     }
545 
546     /**
547      * Checks whether there are any more tokens.
548      *
549      * @return true if there are more tokens.
550      */
551     @Override
552     public boolean hasNext() {
553         checkTokenized();
554         return tokenPos < tokens.length;
555     }
556 
557     /**
558      * Checks whether there are any previous tokens that can be iterated to.
559      *
560      * @return true if there are previous tokens.
561      */
562     @Override
563     public boolean hasPrevious() {
564         checkTokenized();
565         return tokenPos > 0;
566     }
567 
568     /**
569      * Gets whether the tokenizer currently returns empty tokens as null.
570      * The default for this property is false.
571      *
572      * @return true if empty tokens are returned as null.
573      */
574     public boolean isEmptyTokenAsNull() {
575         return this.emptyAsNull;
576     }
577 
578     /**
579      * Gets whether the tokenizer currently ignores empty tokens.
580      * The default for this property is true.
581      *
582      * @return true if empty tokens are not returned.
583      */
584     public boolean isIgnoreEmptyTokens() {
585         return ignoreEmptyTokens;
586     }
587 
588     /**
589      * Checks if the characters at the index specified match the quote
590      * already matched in readNextToken().
591      *
592      * @param srcChars  the character array being tokenized.
593      * @param pos  the position to check for a quote.
594      * @param len  the length of the character array being tokenized.
595      * @param quoteStart  the start position of the matched quote, 0 if no quoting.
596      * @param quoteLen  the length of the matched quote, 0 if no quoting.
597      * @return true if a quote is matched.
598      */
599     private boolean isQuote(final char[] srcChars,
600                             final int pos,
601                             final int len,
602                             final int quoteStart,
603                             final int quoteLen) {
604         for (int i = 0; i < quoteLen; i++) {
605             if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
606                 return false;
607             }
608         }
609         return true;
610     }
611 
612     /**
613      * Gets the next token.
614      *
615      * @return The next String token.
616      * @throws NoSuchElementException if there are no more elements.
617      */
618     @Override
619     public String next() {
620         if (hasNext()) {
621             return tokens[tokenPos++];
622         }
623         throw new NoSuchElementException();
624     }
625 
626     /**
627      * Gets the index of the next token to return.
628      *
629      * @return The next token index.
630      */
631     @Override
632     public int nextIndex() {
633         return tokenPos;
634     }
635 
636     /**
637      * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing {@link NoSuchElementException} when no
638      * tokens remain.
639      *
640      * @return The next sequential token, or null when no more tokens are found.
641      */
642     public String nextToken() {
643         if (hasNext()) {
644             return tokens[tokenPos++];
645         }
646         return null;
647     }
648 
649     /**
650      * Gets the token previous to the last returned token.
651      *
652      * @return The previous token.
653      */
654     @Override
655     public String previous() {
656         if (hasPrevious()) {
657             return tokens[--tokenPos];
658         }
659         throw new NoSuchElementException();
660     }
661 
662     /**
663      * Gets the index of the previous token.
664      *
665      * @return The previous token index.
666      */
667     @Override
668     public int previousIndex() {
669         return tokenPos - 1;
670     }
671 
672     /**
673      * Gets the previous token from the String.
674      *
675      * @return The previous sequential token, or null when no more tokens are found.
676      */
677     public String previousToken() {
678         if (hasPrevious()) {
679             return tokens[--tokenPos];
680         }
681         return null;
682     }
683 
684     /**
685      * Reads character by character through the String to get the next token.
686      *
687      * @param srcChars  the character array being tokenized.
688      * @param start     the first character of field.
689      * @param len       the length of the character array being tokenized.
690      * @param workArea  a temporary work area.
691      * @param tokenList the list of parsed tokens.
692      * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of string found.
693      */
694     private int readNextToken(final char[] srcChars,
695                               int start,
696                               final int len,
697                               final StrBuilder workArea,
698                               final List<String> tokenList) {
699         // skip all leading whitespace, unless it is the
700         // field delimiter or the quote character
701         while (start < len) {
702             final int removeLen = Math.max(
703                     getIgnoredMatcher().isMatch(srcChars, start, start, len),
704                     getTrimmerMatcher().isMatch(srcChars, start, start, len));
705             if (removeLen == 0
706                     || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
707                     || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
708                 break;
709             }
710             start += removeLen;
711         }
712 
713         // handle reaching end
714         if (start >= len) {
715             addToken(tokenList, StringUtils.EMPTY);
716             return -1;
717         }
718 
719         // handle empty token
720         final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
721         if (delimLen > 0) {
722             addToken(tokenList, StringUtils.EMPTY);
723             return start + delimLen;
724         }
725 
726         // handle found token
727         final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
728         if (quoteLen > 0) {
729             return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
730         }
731         return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
732     }
733 
734     /**
735      * Reads a possibly quoted string token.
736      *
737      * @param srcChars   the character array being tokenized.
738      * @param start      the first character of field.
739      * @param len        the length of the character array being tokenized.
740      * @param workArea   a temporary work area.
741      * @param tokenList  the list of parsed tokens.
742      * @param quoteStart the start position of the matched quote, 0 if no quoting.
743      * @param quoteLen   the length of the matched quote, 0 if no quoting.
744      * @return The starting position of the next field (the character immediately after the delimiter, or if end of string found, then the length of string.
745      */
746     private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
747                                final List<String> tokenList, final int quoteStart, final int quoteLen) {
748         // Loop until we've found the end of the quoted
749         // string or the end of the input
750         workArea.clear();
751         int pos = start;
752         boolean quoting = quoteLen > 0;
753         int trimStart = 0;
754 
755         while (pos < len) {
756             // quoting mode can occur several times throughout a string
757             // we must switch between quoting and non-quoting until we
758             // encounter a non-quoted delimiter, or end of string
759             if (quoting) {
760                 // In quoting mode
761 
762                 // If we've found a quote character, see if it's
763                 // followed by a second quote.  If so, then we need
764                 // to actually put the quote character into the token
765                 // rather than end the token.
766                 if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
767                     if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
768                         // matched pair of quotes, thus an escaped quote
769                         workArea.append(srcChars, pos, quoteLen);
770                         pos += quoteLen * 2;
771                         trimStart = workArea.size();
772                         continue;
773                     }
774 
775                     // end of quoting
776                     quoting = false;
777                     pos += quoteLen;
778                     continue;
779                 }
780 
781             } else {
782                 // Not in quoting mode
783 
784                 // check for delimiter, and thus end of token
785                 final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
786                 if (delimLen > 0) {
787                     // return condition when end of token found
788                     addToken(tokenList, workArea.substring(0, trimStart));
789                     return pos + delimLen;
790                 }
791 
792                 // check for quote, and thus back into quoting mode
793                 if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
794                     quoting = true;
795                     pos += quoteLen;
796                     continue;
797                 }
798 
799                 // check for ignored (outside quotes), and ignore
800                 final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
801                 if (ignoredLen > 0) {
802                     pos += ignoredLen;
803                     continue;
804                 }
805 
806                 // check for trimmed character
807                 // don't yet know if its at the end, so copy to workArea
808                 // use trimStart to keep track of trim at the end
809                 final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
810                 if (trimmedLen > 0) {
811                     workArea.append(srcChars, pos, trimmedLen);
812                     pos += trimmedLen;
813                     continue;
814                 }
815 
816             }
817             // copy regular character from inside quotes
818             workArea.append(srcChars[pos++]);
819             trimStart = workArea.size();
820         }
821 
822         // return condition when end of string found
823         addToken(tokenList, workArea.substring(0, trimStart));
824         return -1;
825     }
826 
827     /**
828      * Unsupported ListIterator operation.
829      *
830      * @throws UnsupportedOperationException always.
831      */
832     @Override
833     public void remove() {
834         throw new UnsupportedOperationException("remove() is unsupported");
835     }
836 
837     /**
838      * Resets this tokenizer, forgetting all parsing and iteration already completed.
839      * <p>
840      * This method allows the same tokenizer to be reused for the same String.
841      * </p>
842      *
843      * @return {@code this} instance.
844      */
845     public StrTokenizer reset() {
846         tokenPos = 0;
847         tokens = null;
848         return this;
849     }
850 
851     /**
852      * Reset this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the same settings on multiple input lines.
853      *
854      * @param input the new character array to tokenize, not cloned, null sets no text to parse.
855      * @return {@code this} instance.
856      */
857     public StrTokenizer reset(final char[] input) {
858         reset();
859         if (input != null) {
860             this.chars = input.clone();
861         } else {
862             this.chars = null;
863         }
864         return this;
865     }
866 
867     /**
868      * Reset this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the same settings on multiple input lines.
869      *
870      * @param input the new string to tokenize, null sets no text to parse.
871      * @return {@code this} instance.
872      */
873     public StrTokenizer reset(final String input) {
874         reset();
875         if (input != null) {
876             this.chars = input.toCharArray();
877         } else {
878             this.chars = null;
879         }
880         return this;
881     }
882 
883     /**
884      * Unsupported ListIterator operation.
885      *
886      * @param obj this parameter ignored.
887      * @throws UnsupportedOperationException Always thrown.
888      */
889     @Override
890     public void set(final String obj) {
891         throw new UnsupportedOperationException("set() is unsupported");
892     }
893 
894     /**
895      * Sets the field delimiter character.
896      *
897      * @param delim  the delimiter character to use.
898      * @return {@code this} instance.
899      */
900     public StrTokenizer setDelimiterChar(final char delim) {
901         return setDelimiterMatcher(StrMatcher.charMatcher(delim));
902     }
903 
904     /**
905      * Sets the field delimiter matcher.
906      * <p>
907      * The delimiter is used to separate one token from another.
908      * </p>
909      *
910      * @param delim  the delimiter matcher to use.
911      * @return {@code this} instance.
912      */
913     public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
914         if (delim == null) {
915             this.delimMatcher = StrMatcher.noneMatcher();
916         } else {
917             this.delimMatcher = delim;
918         }
919         return this;
920     }
921 
922     /**
923      * Sets the field delimiter string.
924      *
925      * @param delim  the delimiter string to use.
926      * @return {@code this} instance.
927      */
928     public StrTokenizer setDelimiterString(final String delim) {
929         return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
930     }
931 
932     /**
933      * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
934      *
935      * @param emptyAsNull whether empty tokens are returned as null.
936      * @return {@code this} instance.
937      */
938     public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
939         this.emptyAsNull = emptyAsNull;
940         return this;
941     }
942 
943     /**
944      * Sets the character to ignore.
945      * <p>
946      * This character is ignored when parsing the String, unless it is within a quoted region.
947      * </p>
948      *
949      * @param ignored the ignored character to use.
950      * @return {@code this} instance.
951      */
952     public StrTokenizer setIgnoredChar(final char ignored) {
953         return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
954     }
955 
956     /**
957      * Sets the matcher for characters to ignore.
958      * <p>
959      * These characters are ignored when parsing the String, unless they are within a quoted region.
960      * </p>
961      *
962      * @param ignored the ignored matcher to use, null ignored.
963      * @return {@code this} instance.
964      */
965     public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
966         if (ignored != null) {
967             this.ignoredMatcher = ignored;
968         }
969         return this;
970     }
971 
972     /**
973      * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
974      *
975      * @param ignoreEmptyTokens whether empty tokens are not returned.
976      * @return {@code this} instance.
977      */
978     public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
979         this.ignoreEmptyTokens = ignoreEmptyTokens;
980         return this;
981     }
982 
983     /**
984      * Sets the quote character to use.
985      * <p>
986      * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
987      * </p>
988      *
989      * @param quote the quote character to use.
990      * @return {@code this} instance.
991      */
992     public StrTokenizer setQuoteChar(final char quote) {
993         return setQuoteMatcher(StrMatcher.charMatcher(quote));
994     }
995 
996     /**
997      * Sets the quote matcher to use.
998      * <p>
999      * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
1000      * </p>
1001      *
1002      * @param quote the quote matcher to use, null ignored.
1003      * @return {@code this} instance.
1004      */
1005     public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
1006         if (quote != null) {
1007             this.quoteMatcher = quote;
1008         }
1009         return this;
1010     }
1011 
1012     /**
1013      * Sets the matcher for characters to trim.
1014      * <p>
1015      * These characters are trimmed off on each side of the delimiter until the token or quote is found.
1016      * </p>
1017      *
1018      * @param trimmer the trimmer matcher to use, null ignored
1019      * @return {@code this} instance.
1020      */
1021     public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1022         if (trimmer != null) {
1023             this.trimmerMatcher = trimmer;
1024         }
1025         return this;
1026     }
1027 
1028     /**
1029      * Gets the number of tokens found in the String.
1030      *
1031      * @return The number of matched tokens.
1032      */
1033     public int size() {
1034         checkTokenized();
1035         return tokens.length;
1036     }
1037 
1038     /**
1039      * Internal method to performs the tokenization.
1040      * <p>
1041      * Most users of this class do not need to call this method. This method will be called automatically by other (public) methods when required.
1042      * </p>
1043      * <p>
1044      * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass could alter the character array, offset or
1045      * count to be parsed, or call the tokenizer multiple times on multiple strings. It is also be possible to filter the results.
1046      * </p>
1047      * <p>
1048      * {@code StrTokenizer} will always pass a zero offset and a count equal to the length of the array to this method, however a subclass may pass other
1049      * values, or even an entirely different array.
1050      * </p>
1051      *
1052      * @param srcChars the character array being tokenized, may be null.
1053      * @param offset   the start position within the character array, must be valid.
1054      * @param count    the number of characters to tokenize, must be valid.
1055      * @return The modifiable list of String tokens, unmodifiable if null array or zero count.
1056      */
1057     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1058         if (srcChars == null || count == 0) {
1059             return Collections.emptyList();
1060         }
1061         final StrBuilder buf = new StrBuilder();
1062         final List<String> tokenList = new ArrayList<>();
1063         int pos = offset;
1064 
1065         // loop around the entire buffer
1066         while (pos >= 0 && pos < count) {
1067             // find next token
1068             pos = readNextToken(srcChars, pos, count, buf, tokenList);
1069 
1070             // handle case where end of string is a delimiter
1071             if (pos >= count) {
1072                 addToken(tokenList, StringUtils.EMPTY);
1073             }
1074         }
1075         return tokenList;
1076     }
1077 
1078     /**
1079      * Gets the String content that the tokenizer is parsing.
1080      *
1081      * @return The string content being parsed.
1082      */
1083     @Override
1084     public String toString() {
1085         if (tokens == null) {
1086             return "StrTokenizer[not tokenized yet]";
1087         }
1088         return "StrTokenizer" + getTokenList();
1089     }
1090 
1091 }