View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.lang3.text;
18  
19  import java.util.ArrayList;
20  import java.util.Arrays;
21  import java.util.Collections;
22  import java.util.List;
23  import java.util.ListIterator;
24  import java.util.NoSuchElementException;
25  import java.util.StringTokenizer;
26  
27  import org.apache.commons.lang3.ArrayUtils;
28  import org.apache.commons.lang3.StringUtils;
29  
30  /**
31   * Tokenizes a string based on delimiters (separators)
32   * and supporting quoting and ignored character concepts.
33   * <p>
34   * This class can split a String into many smaller strings. It aims
35   * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
36   * however it offers much more control and flexibility including implementing
37   * the {@link ListIterator} interface. By default, it is set up
38   * like {@link StringTokenizer}.
39   * </p>
40   * <p>
41   * The input String is split into a number of <em>tokens</em>.
42   * Each token is separated from the next String by a <em>delimiter</em>.
43   * One or more delimiter characters must be specified.
44   * </p>
45   * <p>
46   * Each token may be surrounded by quotes.
47   * The <em>quote</em> matcher specifies the quote character(s).
48   * A quote may be escaped within a quoted section by duplicating itself.
49   * </p>
50   * <p>
51   * Between each token and the delimiter are potentially characters that need trimming.
52   * The <em>trimmer</em> matcher specifies these characters.
53   * One usage might be to trim whitespace characters.
54   * </p>
55   * <p>
56   * At any point outside the quotes there might potentially be invalid characters.
57   * The <em>ignored</em> matcher specifies these characters to be removed.
58   * One usage might be to remove new line characters.
59   * </p>
60   * <p>
61   * Empty tokens may be removed or returned as null.
62   * </p>
63   * <pre>
64   * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
65   * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
66   * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
67   * </pre>
68   *
69   * <table>
70   *  <caption>StrTokenizer properties and options</caption>
71   *  <tr>
72   *   <th>Property</th><th>Type</th><th>Default</th>
73   *  </tr>
74   *  <tr>
75   *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
76   *  </tr>
77   *  <tr>
78   *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
79   *  </tr>
80   *  <tr>
81   *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
82   *  </tr>
83   *  <tr>
84   *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
85   *  </tr>
86   *  <tr>
87   *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
88   *  </tr>
89   * </table>
90   *
91   * @since 2.2
92   * @deprecated As of <a href="https://commons.apache.org/proper/commons-lang/changes-report.html#a3.6">3.6</a>, use Apache Commons Text
93   * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html">
94   * StringTokenizer</a>.
95   */
96  @Deprecated
97  public class StrTokenizer implements ListIterator<String>, Cloneable {
98  
99      // @formatter:off
    /**
     * Shared template for CSV tokenizers; it is cloned by {@link #getCSVClone()} rather than handed out.
     * Configured with a comma delimiter, a double-quote quote matcher, no ignored characters,
     * {@link StrMatcher#trimMatcher()} as trimmer, and both emptyTokenAsNull and ignoreEmptyTokens
     * set to {@code false}.
     */
    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
            .setDelimiterMatcher(StrMatcher.commaMatcher())
            .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
            .setIgnoredMatcher(StrMatcher.noneMatcher())
            .setTrimmerMatcher(StrMatcher.trimMatcher())
            .setEmptyTokenAsNull(false)
            .setIgnoreEmptyTokens(false);
107     // @formatter:on
108 
109     // @formatter:off
    /**
     * Shared template for TSV tokenizers; it is cloned by {@link #getTSVClone()} rather than handed out.
     * Configured with a tab delimiter, a double-quote quote matcher, no ignored characters,
     * {@link StrMatcher#trimMatcher()} as trimmer, and both emptyTokenAsNull and ignoreEmptyTokens
     * set to {@code false}.
     */
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
            .setDelimiterMatcher(StrMatcher.tabMatcher())
            .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
            .setIgnoredMatcher(StrMatcher.noneMatcher())
            .setTrimmerMatcher(StrMatcher.trimMatcher())
            .setEmptyTokenAsNull(false)
            .setIgnoreEmptyTokens(false);
117     // @formatter:on
118 
119     /**
120      * Gets a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
121      *
122      * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
123      */
124     private static StrTokenizer getCSVClone() {
125         return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
126     }
127 
128     /**
129      * Gets a new tokenizer instance which parses Comma Separated Value strings
130      * initializing it with the given input.  The default for CSV processing
131      * will be trim whitespace from both ends (which can be overridden with
132      * the setTrimmer method).
133      * <p>
134      * You must call a "reset" method to set the string which you want to parse.
135      * </p>
136      *
137      * @return a new tokenizer instance which parses Comma Separated Value strings.
138      */
139     public static StrTokenizer getCSVInstance() {
140         return getCSVClone();
141     }
142 
143     /**
144      * Gets a new tokenizer instance which parses Comma Separated Value strings
145      * initializing it with the given input.  The default for CSV processing
146      * will be trim whitespace from both ends (which can be overridden with
147      * the setTrimmer method).
148      *
149      * @param input  the text to parse.
150      * @return a new tokenizer instance which parses Comma Separated Value strings.
151      */
152     public static StrTokenizer getCSVInstance(final char[] input) {
153         final StrTokenizer tok = getCSVClone();
154         tok.reset(input);
155         return tok;
156     }
157 
158     /**
159      * Gets a new tokenizer instance which parses Comma Separated Value strings
160      * initializing it with the given input.  The default for CSV processing
161      * will be trim whitespace from both ends (which can be overridden with
162      * the setTrimmer method).
163      *
164      * @param input  the text to parse.
165      * @return a new tokenizer instance which parses Comma Separated Value strings.
166      */
167     public static StrTokenizer getCSVInstance(final String input) {
168         final StrTokenizer tok = getCSVClone();
169         tok.reset(input);
170         return tok;
171     }
172 
173     /**
174      * Gets a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
175      *
176      * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
177      */
178     private static StrTokenizer getTSVClone() {
179         return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
180     }
181 
182     /**
183      * Gets a new tokenizer instance which parses Tab Separated Value strings.
184      * The default for CSV processing will be trim whitespace from both ends
185      * (which can be overridden with the setTrimmer method).
186      * <p>
187      * You must call a "reset" method to set the string which you want to parse.
188      * </p>
189      *
190      * @return a new tokenizer instance which parses Tab Separated Value strings.
191      */
192     public static StrTokenizer getTSVInstance() {
193         return getTSVClone();
194     }
195 
196     /**
197      * Gets a new tokenizer instance which parses Tab Separated Value strings.
198      * The default for CSV processing will be trim whitespace from both ends
199      * (which can be overridden with the setTrimmer method).
200      *
201      * @param input  the string to parse.
202      * @return a new tokenizer instance which parses Tab Separated Value strings.
203      */
204     public static StrTokenizer getTSVInstance(final char[] input) {
205         final StrTokenizer tok = getTSVClone();
206         tok.reset(input);
207         return tok;
208     }
209 
210     /**
211      * Gets a new tokenizer instance which parses Tab Separated Value strings.
212      * The default for CSV processing will be trim whitespace from both ends
213      * (which can be overridden with the setTrimmer method).
214      *
215      * @param input  the string to parse.
216      * @return a new tokenizer instance which parses Tab Separated Value strings.
217      */
218     public static StrTokenizer getTSVInstance(final String input) {
219         final StrTokenizer tok = getTSVClone();
220         tok.reset(input);
221         return tok;
222     }
223 
    /** The text to work on; {@code null} when no input has been supplied. */
    private char[] chars;

    /** The parsed tokens; {@code null} until tokenization has run, and set back to {@code null} by {@link #reset()}. */
    private String[] tokens;

    /** The current iteration position: the index in {@code tokens} that {@link #next()} would return. */
    private int tokenPos;

    /** The delimiter matcher; defaults to {@link StrMatcher#splitMatcher()}. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();

    /** The quote matcher; defaults to {@link StrMatcher#noneMatcher()}. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();

    /** The ignored matcher; defaults to {@link StrMatcher#noneMatcher()}. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();

    /** The trimmer matcher; defaults to {@link StrMatcher#noneMatcher()}. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null; defaults to {@code false}. */
    private boolean emptyAsNull;

    /** Whether to ignore empty tokens; defaults to {@code true}. */
    private boolean ignoreEmptyTokens = true;
250 
251     /**
252      * Constructs a tokenizer splitting on space, tab, newline and formfeed
253      * as per StringTokenizer, but with no text to tokenize.
254      * <p>
255      * This constructor is normally used with {@link #reset(String)}.
256      * </p>
257      */
258     public StrTokenizer() {
259         this.chars = null;
260     }
261 
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the characters to be parsed; the result of {@code ArrayUtils.clone(input)}
     *  is stored, not the array itself.
     */
    public StrTokenizer(final char[] input) {
        this.chars = ArrayUtils.clone(input);
    }
271 
    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input  the characters to be parsed; copied by the delegated
     *  {@link #StrTokenizer(char[])} constructor.
     * @param delim the field delimiter character.
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }
282 
    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the characters to be parsed; copied by the delegated
     *  {@link #StrTokenizer(char[])} constructor.
     * @param delim  the field delimiter character.
     * @param quote  the field quoted string character.
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }
295 
    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input  the characters to be parsed; copied by the delegated
     *  {@link #StrTokenizer(char[])} constructor.
     * @param delim the field delimiter string.
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }
306 
    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the characters to be parsed; copied by the delegated
     *  {@link #StrTokenizer(char[])} constructor.
     * @param delim  the field delimiter matcher.
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }
317 
    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the characters to be parsed; copied by the delegated
     *  {@link #StrTokenizer(char[])} constructor.
     * @param delim  the field delimiter matcher.
     * @param quote  the field quoted string matcher.
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }
330 
331     /**
332      * Constructs a tokenizer splitting on space, tab, newline and formfeed
333      * as per StringTokenizer.
334      *
335      * @param input  the string which is to be parsed.
336      */
337     public StrTokenizer(final String input) {
338         if (input != null) {
339             chars = input.toCharArray();
340         } else {
341             chars = null;
342         }
343     }
344 
    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input  the string which is to be parsed; {@code null} leaves the tokenizer without text.
     * @param delim  the field delimiter character, passed to {@code setDelimiterChar}.
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }
355 
    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed; {@code null} leaves the tokenizer without text.
     * @param delim  the field delimiter character, passed to {@code setDelimiterChar}.
     * @param quote  the field quoted string character, passed to {@code setQuoteChar}.
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }
368 
    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed; {@code null} leaves the tokenizer without text.
     * @param delim  the field delimiter string, passed to {@code setDelimiterString}.
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }
379 
    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed; {@code null} leaves the tokenizer without text.
     * @param delim  the field delimiter matcher, passed to {@code setDelimiterMatcher}.
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }
390 
    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed; {@code null} leaves the tokenizer without text.
     * @param delim  the field delimiter matcher, passed to {@code setDelimiterMatcher}.
     * @param quote  the field quoted string matcher, passed to {@code setQuoteMatcher}.
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }
403 
    /**
     * Unsupported ListIterator operation: tokens cannot be inserted through this iterator.
     *
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always.
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }
414 
415     /**
416      * Adds a token to a list, paying attention to the parameters we've set.
417      *
418      * @param list  the list to add to.
419      * @param tok  the token to add.
420      */
421     private void addToken(final List<String> list, String tok) {
422         if (StringUtils.isEmpty(tok)) {
423             if (isIgnoreEmptyTokens()) {
424                 return;
425             }
426             if (isEmptyTokenAsNull()) {
427                 tok = null;
428             }
429         }
430         list.add(tok);
431     }
432 
433     /**
434      * Checks if tokenization has been done, and if not then do it.
435      */
436     private void checkTokenized() {
437         if (tokens == null) {
438             if (chars == null) {
439                 // still call tokenize as subclass may do some work
440                 final List<String> split = tokenize(null, 0, 0);
441                 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
442             } else {
443                 final List<String> split = tokenize(chars, 0, chars.length);
444                 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
445             }
446         }
447     }
448 
449     /**
450      * Creates a new instance of this Tokenizer. The new instance is reset so
451      * that it will be at the start of the token list.
452      * If a {@link CloneNotSupportedException} is caught, return {@code null}.
453      *
454      * @return a new instance of this Tokenizer which has been reset.
455      */
456     @Override
457     public Object clone() {
458         try {
459             return cloneReset();
460         } catch (final CloneNotSupportedException ex) {
461             return null;
462         }
463     }
464 
465     /**
466      * Creates a new instance of this Tokenizer. The new instance is reset so that
467      * it will be at the start of the token list.
468      *
469      * @return a new instance of this Tokenizer which has been reset.
470      * @throws CloneNotSupportedException if there is a problem cloning.
471      */
472     Object cloneReset() throws CloneNotSupportedException {
473         // this method exists to enable 100% test coverage
474         final StrTokenizer cloned = (StrTokenizer) super.clone();
475         if (cloned.chars != null) {
476             cloned.chars = cloned.chars.clone();
477         }
478         cloned.reset();
479         return cloned;
480     }
481 
482     /**
483      * Gets the String content that the tokenizer is parsing.
484      *
485      * @return the string content being parsed.
486      */
487     public String getContent() {
488         if (chars == null) {
489             return null;
490         }
491         return new String(chars);
492     }
493 
494     /**
495      * Gets the field delimiter matcher.
496      *
497      * @return the delimiter matcher in use.
498      */
499     public StrMatcher getDelimiterMatcher() {
500         return this.delimMatcher;
501     }
502 
503     /**
504      * Gets the ignored character matcher.
505      * <p>
506      * These characters are ignored when parsing the String, unless they are
507      * within a quoted region.
508      * The default value is not to ignore anything.
509      * </p>
510      *
511      * @return the ignored matcher in use.
512      */
513     public StrMatcher getIgnoredMatcher() {
514         return ignoredMatcher;
515     }
516 
517     /**
518      * Gets the quote matcher currently in use.
519      * <p>
520      * The quote character is used to wrap data between the tokens.
521      * This enables delimiters to be entered as data.
522      * The default value is '"' (double quote).
523      * </p>
524      *
525      * @return the quote matcher in use.
526      */
527     public StrMatcher getQuoteMatcher() {
528         return quoteMatcher;
529     }
530 
531     /**
532      * Gets a copy of the full token list as an independent modifiable array.
533      *
534      * @return the tokens as a String array.
535      */
536     public String[] getTokenArray() {
537         checkTokenized();
538         return tokens.clone();
539     }
540 
541     /**
542      * Gets a copy of the full token list as an independent modifiable list.
543      *
544      * @return the tokens as a String array.
545      */
546     public List<String> getTokenList() {
547         checkTokenized();
548         final List<String> list = new ArrayList<>(tokens.length);
549         list.addAll(Arrays.asList(tokens));
550         return list;
551     }
552 
553     /**
554      * Gets the trimmer character matcher.
555      * <p>
556      * These characters are trimmed off on each side of the delimiter
557      * until the token or quote is found.
558      * The default value is not to trim anything.
559      * </p>
560      *
561      * @return the trimmer matcher in use.
562      */
563     public StrMatcher getTrimmerMatcher() {
564         return trimmerMatcher;
565     }
566 
567     /**
568      * Checks whether there are any more tokens.
569      *
570      * @return true if there are more tokens.
571      */
572     @Override
573     public boolean hasNext() {
574         checkTokenized();
575         return tokenPos < tokens.length;
576     }
577 
578     /**
579      * Checks whether there are any previous tokens that can be iterated to.
580      *
581      * @return true if there are previous tokens.
582      */
583     @Override
584     public boolean hasPrevious() {
585         checkTokenized();
586         return tokenPos > 0;
587     }
588 
589     /**
590      * Gets whether the tokenizer currently returns empty tokens as null.
591      * The default for this property is false.
592      *
593      * @return true if empty tokens are returned as null.
594      */
595     public boolean isEmptyTokenAsNull() {
596         return this.emptyAsNull;
597     }
598 
599     /**
600      * Gets whether the tokenizer currently ignores empty tokens.
601      * The default for this property is true.
602      *
603      * @return true if empty tokens are not returned.
604      */
605     public boolean isIgnoreEmptyTokens() {
606         return ignoreEmptyTokens;
607     }
608 
609     /**
610      * Checks if the characters at the index specified match the quote
611      * already matched in readNextToken().
612      *
613      * @param srcChars  the character array being tokenized.
614      * @param pos  the position to check for a quote.
615      * @param len  the length of the character array being tokenized.
616      * @param quoteStart  the start position of the matched quote, 0 if no quoting.
617      * @param quoteLen  the length of the matched quote, 0 if no quoting.
618      * @return true if a quote is matched.
619      */
620     private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
621         for (int i = 0; i < quoteLen; i++) {
622             if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
623                 return false;
624             }
625         }
626         return true;
627     }
628 
629     /**
630      * Gets the next token.
631      *
632      * @return the next String token.
633      * @throws NoSuchElementException if there are no more elements.
634      */
635     @Override
636     public String next() {
637         if (hasNext()) {
638             return tokens[tokenPos++];
639         }
640         throw new NoSuchElementException();
641     }
642 
643     /**
644      * Gets the index of the next token to return.
645      *
646      * @return the next token index.
647      */
648     @Override
649     public int nextIndex() {
650         return tokenPos;
651     }
652 
653     /**
654      * Gets the next token from the String.
655      * Equivalent to {@link #next()} except it returns null rather than
656      * throwing {@link NoSuchElementException} when no tokens remain.
657      *
658      * @return the next sequential token, or null when no more tokens are found.
659      */
660     public String nextToken() {
661         if (hasNext()) {
662             return tokens[tokenPos++];
663         }
664         return null;
665     }
666 
667     /**
668      * Gets the token previous to the last returned token.
669      *
670      * @return the previous token.
671      */
672     @Override
673     public String previous() {
674         if (hasPrevious()) {
675             return tokens[--tokenPos];
676         }
677         throw new NoSuchElementException();
678     }
679 
    /**
     * Gets the index of the previous token, which is one less than the
     * current iteration position (so -1 when positioned at the start).
     *
     * @return the previous token index.
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }
689 
690     /**
691      * Gets the previous token from the String.
692      *
693      * @return the previous sequential token, or null when no more tokens are found.
694      */
695     public String previousToken() {
696         if (hasPrevious()) {
697             return tokens[--tokenPos];
698         }
699         return null;
700     }
701 
    /**
     * Reads character by character through the String to get the next token.
     * <p>
     * Leading ignored/trimmed characters are skipped first; the token (possibly
     * empty) is then added to {@code tokenList} through {@code addToken}, so the
     * empty-token settings decide whether an empty token is actually stored.
     * </p>
     *
     * @param srcChars  the character array being tokenized.
     * @param start  the first character of field.
     * @param len  the length of the character array being tokenized.
     * @param workArea  a temporary work area.
     * @param tokenList  the list of parsed tokens.
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found.
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
        // skip all leading characters matched by the ignored or trimmer matcher,
        // unless the position is also the start of a field delimiter or a quote
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 ||
                getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
                getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end: only skippable characters remained, so the field is empty
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token: a delimiter follows immediately
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token: if it opens with a quote, start reading after the quote
        // and pass where the quote was found so its closing occurrence can be recognised
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }
748 
    /**
     * Reads a possibly quoted string token.
     * <p>
     * Characters are accumulated in {@code workArea}; {@code trimStart} records the
     * length of the accumulated text up to the last character that must be kept, so
     * that trailing trimmer characters copied speculatively are cut off when the
     * token is finally added.
     * </p>
     *
     * @param srcChars  the character array being tokenized.
     * @param start  the first character of field.
     * @param len  the length of the character array being tokenized.
     * @param workArea  a temporary work area.
     * @param tokenList  the list of parsed tokens.
     * @param quoteStart  the start position of the matched quote, 0 if no quoting.
     * @param quoteLen  the length of the matched quote, 0 if no quoting.
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found.
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote.  If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                // (trimStart is deliberately NOT advanced here)
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }
            }
            // copy a regular character: reached both for any non-quote character inside
            // quotes and for an unmatched character outside quotes; it must be kept,
            // so trimStart moves to the new end of the work area
            workArea.append(srcChars[pos++]);
            trimStart = workArea.size();
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }
842 
    /**
     * Unsupported ListIterator operation: tokens cannot be removed through this iterator.
     *
     * @throws UnsupportedOperationException always.
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }
852 
853     /**
854      * Resets this tokenizer, forgetting all parsing and iteration already completed.
855      * <p>
856      * This method allows the same tokenizer to be reused for the same String.
857      * </p>
858      *
859      * @return {@code this} instance.
860      */
861     public StrTokenizer reset() {
862         tokenPos = 0;
863         tokens = null;
864         return this;
865     }
866 
867     /**
868      * Reset this tokenizer, giving it a new input string to parse.
869      * In this manner you can re-use a tokenizer with the same settings
870      * on multiple input lines.
871      *
872      * @param input  the new character array to tokenize, not cloned, null sets no text to parse.
873      * @return {@code this} instance.
874      */
875     public StrTokenizer reset(final char[] input) {
876         reset();
877         this.chars = ArrayUtils.clone(input);
878         return this;
879     }
880 
881     /**
882      * Reset this tokenizer, giving it a new input string to parse.
883      * In this manner you can re-use a tokenizer with the same settings
884      * on multiple input lines.
885      *
886      * @param input  the new string to tokenize, null sets no text to parse.
887      * @return {@code this} instance.
888      */
889     public StrTokenizer reset(final String input) {
890         reset();
891         if (input != null) {
892             this.chars = input.toCharArray();
893         } else {
894             this.chars = null;
895         }
896         return this;
897     }
898 
899     /**
900      * Unsupported ListIterator operation.
901      *
902      * @param obj this parameter ignored.
903      * @throws UnsupportedOperationException always.
904      */
905     @Override
906     public void set(final String obj) {
907         throw new UnsupportedOperationException("set() is unsupported");
908     }
909 
910     /**
911      * Sets the field delimiter character.
912      *
913      * @param delim  the delimiter character to use.
914      * @return {@code this} instance.
915      */
916     public StrTokenizer setDelimiterChar(final char delim) {
917         return setDelimiterMatcher(StrMatcher.charMatcher(delim));
918     }
919 
920     /**
921      * Sets the field delimiter matcher.
922      * <p>
923      * The delimiter is used to separate one token from another.
924      * </p>
925      *
926      * @param delim  the delimiter matcher to use.
927      * @return {@code this} instance.
928      */
929     public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
930         if (delim == null) {
931             this.delimMatcher = StrMatcher.noneMatcher();
932         } else {
933             this.delimMatcher = delim;
934         }
935         return this;
936     }
937 
938     /**
939      * Sets the field delimiter string.
940      *
941      * @param delim  the delimiter string to use.
942      * @return {@code this} instance.
943      */
944     public StrTokenizer setDelimiterString(final String delim) {
945         return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
946     }
947 
948     /**
949      * Sets whether the tokenizer should return empty tokens as null.
950      * The default for this property is false.
951      *
952      * @param emptyAsNull  whether empty tokens are returned as null.
953      * @return {@code this} instance.
954      */
955     public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
956         this.emptyAsNull = emptyAsNull;
957         return this;
958     }
959 
960     /**
961      * Sets the character to ignore.
962      * <p>
963      * This character is ignored when parsing the String, unless it is
964      * within a quoted region.
965      *
966      * @param ignored  the ignored character to use.
967      * @return {@code this} instance.
968      */
969     public StrTokenizer setIgnoredChar(final char ignored) {
970         return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
971     }
972 
973     /**
974      * Sets the matcher for characters to ignore.
975      * <p>
976      * These characters are ignored when parsing the String, unless they are
977      * within a quoted region.
978      * </p>
979      *
980      * @param ignored  the ignored matcher to use, null ignored.
981      * @return {@code this} instance.
982      */
983     public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
984         if (ignored != null) {
985             this.ignoredMatcher = ignored;
986         }
987         return this;
988     }
989 
990     /**
991      * Sets whether the tokenizer should ignore and not return empty tokens.
992      * The default for this property is true.
993      *
994      * @param ignoreEmptyTokens  whether empty tokens are not returned.
995      * @return {@code this} instance.
996      */
997     public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
998         this.ignoreEmptyTokens = ignoreEmptyTokens;
999         return this;
1000     }
1001 
1002     /**
1003      * Sets the quote character to use.
1004      * <p>
1005      * The quote character is used to wrap data between the tokens.
1006      * This enables delimiters to be entered as data.
1007      * </p>
1008      *
1009      * @param quote  the quote character to use.
1010      * @return {@code this} instance.
1011      */
1012     public StrTokenizer setQuoteChar(final char quote) {
1013         return setQuoteMatcher(StrMatcher.charMatcher(quote));
1014     }
1015 
1016     /**
1017      * Sets the quote matcher to use.
1018      * <p>
1019      * The quote character is used to wrap data between the tokens.
1020      * This enables delimiters to be entered as data.
1021      * </p>
1022      *
1023      * @param quote  the quote matcher to use, null ignored.
1024      * @return {@code this} instance.
1025      */
1026     public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
1027         if (quote != null) {
1028             this.quoteMatcher = quote;
1029         }
1030         return this;
1031     }
1032 
1033     /**
1034      * Sets the matcher for characters to trim.
1035      * <p>
1036      * These characters are trimmed off on each side of the delimiter
1037      * until the token or quote is found.
1038      * </p>
1039      *
1040      * @param trimmer  the trimmer matcher to use, null ignored.
1041      * @return {@code this} instance.
1042      */
1043     public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1044         if (trimmer != null) {
1045             this.trimmerMatcher = trimmer;
1046         }
1047         return this;
1048     }
1049 
1050     /**
1051      * Gets the number of tokens found in the String.
1052      *
1053      * @return the number of matched tokens.
1054      */
1055     public int size() {
1056         checkTokenized();
1057         return tokens.length;
1058     }
1059 
1060     /**
1061      * Internal method to performs the tokenization.
1062      * <p>
1063      * Most users of this class do not need to call this method. This method
1064      * will be called automatically by other (public) methods when required.
1065      * </p>
1066      * <p>
1067      * This method exists to allow subclasses to add code before or after the
1068      * tokenization. For example, a subclass could alter the character array,
1069      * offset or count to be parsed, or call the tokenizer multiple times on
1070      * multiple strings. It is also be possible to filter the results.
1071      * </p>
1072      * <p>
1073      * {@link StrTokenizer} will always pass a zero offset and a count
1074      * equal to the length of the array to this method, however a subclass
1075      * may pass other values, or even an entirely different array.
1076      * </p>
1077      *
1078      * @param srcChars  the character array being tokenized, may be null.
1079      * @param offset  the start position within the character array, must be valid.
1080      * @param count  the number of characters to tokenize, must be valid.
1081      * @return the modifiable list of String tokens, unmodifiable if null array or zero count.
1082      */
1083     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1084         if (ArrayUtils.isEmpty(srcChars)) {
1085             return Collections.emptyList();
1086         }
1087         final StrBuilder buf = new StrBuilder();
1088         final List<String> tokenList = new ArrayList<>();
1089         int pos = offset;
1090 
1091         // loop around the entire buffer
1092         while (pos >= 0 && pos < count) {
1093             // find next token
1094             pos = readNextToken(srcChars, pos, count, buf, tokenList);
1095 
1096             // handle case where end of string is a delimiter
1097             if (pos >= count) {
1098                 addToken(tokenList, StringUtils.EMPTY);
1099             }
1100         }
1101         return tokenList;
1102     }
1103 
1104     /**
1105      * Gets the String content that the tokenizer is parsing.
1106      *
1107      * @return the string content being parsed.
1108      */
1109     @Override
1110     public String toString() {
1111         if (tokens == null) {
1112             return "StrTokenizer[not tokenized yet]";
1113         }
1114         return "StrTokenizer" + getTokenList();
1115     }
1116 
1117 }