View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.lang3.text;
18  
19  import java.util.ArrayList;
20  import java.util.Arrays;
21  import java.util.Collections;
22  import java.util.List;
23  import java.util.ListIterator;
24  import java.util.NoSuchElementException;
25  import java.util.StringTokenizer;
26  
27  import org.apache.commons.lang3.ArrayUtils;
28  import org.apache.commons.lang3.StringUtils;
29  
30  /**
31   * Tokenizes a string based on delimiters (separators)
32   * and supporting quoting and ignored character concepts.
33   * <p>
34   * This class can split a String into many smaller strings. It aims
35   * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
36   * however it offers much more control and flexibility including implementing
37   * the {@link ListIterator} interface. By default, it is set up
38   * like {@link StringTokenizer}.
39   * </p>
40   * <p>
41   * The input String is split into a number of <em>tokens</em>.
42   * Each token is separated from the next String by a <em>delimiter</em>.
43   * One or more delimiter characters must be specified.
44   * </p>
45   * <p>
46   * Each token may be surrounded by quotes.
47   * The <em>quote</em> matcher specifies the quote character(s).
48   * A quote may be escaped within a quoted section by duplicating itself.
49   * </p>
50   * <p>
51   * Between each token and the delimiter are potentially characters that need trimming.
52   * The <em>trimmer</em> matcher specifies these characters.
53   * One usage might be to trim whitespace characters.
54   * </p>
55   * <p>
56   * At any point outside the quotes there might potentially be invalid characters.
57   * The <em>ignored</em> matcher specifies these characters to be removed.
58   * One usage might be to remove new line characters.
59   * </p>
60   * <p>
61   * Empty tokens may be removed or returned as null.
62   * </p>
63   * <pre>
64   * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
65   * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
66   * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
67   * </pre>
68   *
69   * <table>
70   *  <caption>StrTokenizer properties and options</caption>
71   *  <tr>
72   *   <th>Property</th><th>Type</th><th>Default</th>
73   *  </tr>
74   *  <tr>
75   *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
76   *  </tr>
77   *  <tr>
78   *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
79   *  </tr>
80   *  <tr>
81   *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
82   *  </tr>
83   *  <tr>
84   *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
85   *  </tr>
86   *  <tr>
87   *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
88   *  </tr>
89   * </table>
90   *
91   * @since 2.2
92   * @deprecated As of <a href="https://commons.apache.org/proper/commons-lang/changes-report.html#a3.6">3.6</a>, use Apache Commons Text
93   * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html">
94   * StringTokenizer</a>.
95   */
96  @Deprecated
97  public class StrTokenizer implements ListIterator<String>, Cloneable {
98  
    // @formatter:off
    /**
     * Prototype cloned by the CSV factory methods. It is configured with
     * {@link StrMatcher#commaMatcher()} as delimiter, {@link StrMatcher#doubleQuoteMatcher()}
     * as quote, {@link StrMatcher#noneMatcher()} as ignored matcher and
     * {@link StrMatcher#trimMatcher()} as trimmer; empty tokens are neither
     * converted to {@code null} nor dropped.
     */
    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
            .setDelimiterMatcher(StrMatcher.commaMatcher())
            .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
            .setIgnoredMatcher(StrMatcher.noneMatcher())
            .setTrimmerMatcher(StrMatcher.trimMatcher())
            .setEmptyTokenAsNull(false)
            .setIgnoreEmptyTokens(false);
    // @formatter:on
108 
    // @formatter:off
    /**
     * Prototype cloned by the TSV factory methods. It is configured with
     * {@link StrMatcher#tabMatcher()} as delimiter, {@link StrMatcher#doubleQuoteMatcher()}
     * as quote, {@link StrMatcher#noneMatcher()} as ignored matcher and
     * {@link StrMatcher#trimMatcher()} as trimmer; empty tokens are neither
     * converted to {@code null} nor dropped.
     */
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
            .setDelimiterMatcher(StrMatcher.tabMatcher())
            .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
            .setIgnoredMatcher(StrMatcher.noneMatcher())
            .setTrimmerMatcher(StrMatcher.trimMatcher())
            .setEmptyTokenAsNull(false)
            .setIgnoreEmptyTokens(false);
    // @formatter:on
118 
119     /**
120      * Gets a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
121      *
122      * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
123      */
124     private static StrTokenizer getCSVClone() {
125         return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
126     }
    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * No input is set on the returned instance. The default for CSV processing
     * will be trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * </p>
     * @return a new tokenizer instance which parses Comma Separated Value strings.
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }
140     /**
141      * Gets a new tokenizer instance which parses Comma Separated Value strings
142      * initializing it with the given input.  The default for CSV processing
143      * will be trim whitespace from both ends (which can be overridden with
144      * the setTrimmer method).
145      *
146      * @param input  the text to parse.
147      * @return a new tokenizer instance which parses Comma Separated Value strings.
148      */
149     public static StrTokenizer getCSVInstance(final char[] input) {
150         final StrTokenizer tok = getCSVClone();
151         tok.reset(input);
152         return tok;
153     }
154 
155     /**
156      * Gets a new tokenizer instance which parses Comma Separated Value strings
157      * initializing it with the given input.  The default for CSV processing
158      * will be trim whitespace from both ends (which can be overridden with
159      * the setTrimmer method).
160      *
161      * @param input  the text to parse.
162      * @return a new tokenizer instance which parses Comma Separated Value strings.
163      */
164     public static StrTokenizer getCSVInstance(final String input) {
165         final StrTokenizer tok = getCSVClone();
166         tok.reset(input);
167         return tok;
168     }
169     /**
170      * Gets a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
171      *
172      * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
173      */
174     private static StrTokenizer getTSVClone() {
175         return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
176     }
177 
    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * No input is set on the returned instance. The default for TSV processing
     * will be trim whitespace from both ends (which can be overridden with
     * the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * </p>
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }
190 
191     /**
192      * Gets a new tokenizer instance which parses Tab Separated Value strings.
193      * The default for CSV processing will be trim whitespace from both ends
194      * (which can be overridden with the setTrimmer method).
195      *
196      * @param input  the string to parse.
197      * @return a new tokenizer instance which parses Tab Separated Value strings.
198      */
199     public static StrTokenizer getTSVInstance(final char[] input) {
200         final StrTokenizer tok = getTSVClone();
201         tok.reset(input);
202         return tok;
203     }
204 
205     /**
206      * Gets a new tokenizer instance which parses Tab Separated Value strings.
207      * The default for CSV processing will be trim whitespace from both ends
208      * (which can be overridden with the setTrimmer method).
209      *
210      * @param input  the string to parse.
211      * @return a new tokenizer instance which parses Tab Separated Value strings.
212      */
213     public static StrTokenizer getTSVInstance(final String input) {
214         final StrTokenizer tok = getTSVClone();
215         tok.reset(input);
216         return tok;
217     }
    /** The text to work on; {@code null} when no input has been set. */
    private char[] chars;

    /** The parsed tokens; {@code null} until tokenization has run, and again after a reset. */
    private String[] tokens;

    /** The current iteration position: the index in {@code tokens} of the next token to return. */
    private int tokenPos;

    /** The delimiter matcher; initially {@code StrMatcher.splitMatcher()}. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();

    /** The quote matcher; initially {@code StrMatcher.noneMatcher()}. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();

    /** The ignored matcher; initially {@code StrMatcher.noneMatcher()}. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();

    /** The trimmer matcher; initially {@code StrMatcher.noneMatcher()}. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null; initially {@code false}. */
    private boolean emptyAsNull;

    /** Whether to ignore empty tokens; initially {@code true}. */
    private boolean ignoreEmptyTokens = true;
244 
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     * </p>
     */
    public StrTokenizer() {
        // no input yet; a reset(...) overload supplies it later
        this.chars = null;
    }
255 
    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input  the characters which are to be parsed; the tokenizer keeps
     *  the result of {@code ArrayUtils.clone(input)} rather than the array itself.
     */
    public StrTokenizer(final char[] input) {
        this.chars = ArrayUtils.clone(input);
    }
265 
    /**
     * Constructs a tokenizer splitting on the specified character.
     *
     * @param input  the characters which are to be parsed; the tokenizer keeps
     *  the result of {@code ArrayUtils.clone(input)} rather than the array itself.
     * @param delim the field delimiter character.
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }
276 
    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the characters which are to be parsed; the tokenizer keeps
     *  the result of {@code ArrayUtils.clone(input)} rather than the array itself.
     * @param delim  the field delimiter character.
     * @param quote  the field quoted string character.
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }
289 
    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input  the characters which are to be parsed; the tokenizer keeps
     *  the result of {@code ArrayUtils.clone(input)} rather than the array itself.
     * @param delim the field delimiter string.
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }
300 
    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the characters which are to be parsed; the tokenizer keeps
     *  the result of {@code ArrayUtils.clone(input)} rather than the array itself.
     * @param delim  the field delimiter matcher.
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }
311 
    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the characters which are to be parsed; the tokenizer keeps
     *  the result of {@code ArrayUtils.clone(input)} rather than the array itself.
     * @param delim  the field delimiter matcher.
     * @param quote  the field quoted string matcher.
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }
324 
325     /**
326      * Constructs a tokenizer splitting on space, tab, newline and formfeed
327      * as per StringTokenizer.
328      *
329      * @param input  the string which is to be parsed.
330      */
331     public StrTokenizer(final String input) {
332         if (input != null) {
333             chars = input.toCharArray();
334         } else {
335             chars = null;
336         }
337     }
338 
    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input  the string which is to be parsed; {@code null} leaves the
     *  tokenizer without any text.
     * @param delim  the field delimiter character.
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }
349 
    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input  the string which is to be parsed; {@code null} leaves the
     *  tokenizer without any text.
     * @param delim  the field delimiter character.
     * @param quote  the field quoted string character.
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }
362 
    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input  the string which is to be parsed; {@code null} leaves the
     *  tokenizer without any text.
     * @param delim  the field delimiter string.
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }
373 
    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input  the string which is to be parsed; {@code null} leaves the
     *  tokenizer without any text.
     * @param delim  the field delimiter matcher.
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }
384 
    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input  the string which is to be parsed; {@code null} leaves the
     *  tokenizer without any text.
     * @param delim  the field delimiter matcher.
     * @param quote  the field quoted string matcher.
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }
397 
    /**
     * Unsupported ListIterator operation: tokens cannot be inserted through
     * this iterator.
     *
     * @param obj this parameter ignored.
     * @throws UnsupportedOperationException always.
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }
408 
409     /**
410      * Adds a token to a list, paying attention to the parameters we've set.
411      *
412      * @param list  the list to add to.
413      * @param tok  the token to add.
414      */
415     private void addToken(final List<String> list, String tok) {
416         if (StringUtils.isEmpty(tok)) {
417             if (isIgnoreEmptyTokens()) {
418                 return;
419             }
420             if (isEmptyTokenAsNull()) {
421                 tok = null;
422             }
423         }
424         list.add(tok);
425     }
426 
427     /**
428      * Checks if tokenization has been done, and if not then do it.
429      */
430     private void checkTokenized() {
431         if (tokens == null) {
432             if (chars == null) {
433                 // still call tokenize as subclass may do some work
434                 final List<String> split = tokenize(null, 0, 0);
435                 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
436             } else {
437                 final List<String> split = tokenize(chars, 0, chars.length);
438                 tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
439             }
440         }
441     }
442 
443     /**
444      * Creates a new instance of this Tokenizer. The new instance is reset so
445      * that it will be at the start of the token list.
446      * If a {@link CloneNotSupportedException} is caught, return {@code null}.
447      *
448      * @return a new instance of this Tokenizer which has been reset.
449      */
450     @Override
451     public Object clone() {
452         try {
453             return cloneReset();
454         } catch (final CloneNotSupportedException ex) {
455             return null;
456         }
457     }
458 
459     /**
460      * Creates a new instance of this Tokenizer. The new instance is reset so that
461      * it will be at the start of the token list.
462      *
463      * @return a new instance of this Tokenizer which has been reset.
464      * @throws CloneNotSupportedException if there is a problem cloning.
465      */
466     Object cloneReset() throws CloneNotSupportedException {
467         // this method exists to enable 100% test coverage
468         final StrTokenizer cloned = (StrTokenizer) super.clone();
469         if (cloned.chars != null) {
470             cloned.chars = cloned.chars.clone();
471         }
472         cloned.reset();
473         return cloned;
474     }
475 
476     /**
477      * Gets the String content that the tokenizer is parsing.
478      *
479      * @return the string content being parsed.
480      */
481     public String getContent() {
482         if (chars == null) {
483             return null;
484         }
485         return new String(chars);
486     }
487 
488     /**
489      * Gets the field delimiter matcher.
490      *
491      * @return the delimiter matcher in use.
492      */
493     public StrMatcher getDelimiterMatcher() {
494         return this.delimMatcher;
495     }
496 
497     // Ignored
498     /**
499      * Gets the ignored character matcher.
500      * <p>
501      * These characters are ignored when parsing the String, unless they are
502      * within a quoted region.
503      * The default value is not to ignore anything.
504      * </p>
505      *
506      * @return the ignored matcher in use.
507      */
508     public StrMatcher getIgnoredMatcher() {
509         return ignoredMatcher;
510     }
511 
512     /**
513      * Gets the quote matcher currently in use.
514      * <p>
515      * The quote character is used to wrap data between the tokens.
516      * This enables delimiters to be entered as data.
517      * The default value is '"' (double quote).
518      * </p>
519      *
520      * @return the quote matcher in use.
521      */
522     public StrMatcher getQuoteMatcher() {
523         return quoteMatcher;
524     }
525 
526     /**
527      * Gets a copy of the full token list as an independent modifiable array.
528      *
529      * @return the tokens as a String array.
530      */
531     public String[] getTokenArray() {
532         checkTokenized();
533         return tokens.clone();
534     }
535 
536     /**
537      * Gets a copy of the full token list as an independent modifiable list.
538      *
539      * @return the tokens as a String array.
540      */
541     public List<String> getTokenList() {
542         checkTokenized();
543         final List<String> list = new ArrayList<>(tokens.length);
544         list.addAll(Arrays.asList(tokens));
545         return list;
546     }
547 
548     /**
549      * Gets the trimmer character matcher.
550      * <p>
551      * These characters are trimmed off on each side of the delimiter
552      * until the token or quote is found.
553      * The default value is not to trim anything.
554      * </p>
555      *
556      * @return the trimmer matcher in use.
557      */
558     public StrMatcher getTrimmerMatcher() {
559         return trimmerMatcher;
560     }
561 
562     /**
563      * Checks whether there are any more tokens.
564      *
565      * @return true if there are more tokens.
566      */
567     @Override
568     public boolean hasNext() {
569         checkTokenized();
570         return tokenPos < tokens.length;
571     }
572 
573     /**
574      * Checks whether there are any previous tokens that can be iterated to.
575      *
576      * @return true if there are previous tokens.
577      */
578     @Override
579     public boolean hasPrevious() {
580         checkTokenized();
581         return tokenPos > 0;
582     }
583 
584     /**
585      * Gets whether the tokenizer currently returns empty tokens as null.
586      * The default for this property is false.
587      *
588      * @return true if empty tokens are returned as null.
589      */
590     public boolean isEmptyTokenAsNull() {
591         return this.emptyAsNull;
592     }
593 
594     /**
595      * Gets whether the tokenizer currently ignores empty tokens.
596      * The default for this property is true.
597      *
598      * @return true if empty tokens are not returned.
599      */
600     public boolean isIgnoreEmptyTokens() {
601         return ignoreEmptyTokens;
602     }
603 
604     /**
605      * Checks if the characters at the index specified match the quote
606      * already matched in readNextToken().
607      *
608      * @param srcChars  the character array being tokenized.
609      * @param pos  the position to check for a quote.
610      * @param len  the length of the character array being tokenized.
611      * @param quoteStart  the start position of the matched quote, 0 if no quoting.
612      * @param quoteLen  the length of the matched quote, 0 if no quoting.
613      * @return true if a quote is matched.
614      */
615     private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
616         for (int i = 0; i < quoteLen; i++) {
617             if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
618                 return false;
619             }
620         }
621         return true;
622     }
623 
624     /**
625      * Gets the next token.
626      *
627      * @return the next String token.
628      * @throws NoSuchElementException if there are no more elements.
629      */
630     @Override
631     public String next() {
632         if (hasNext()) {
633             return tokens[tokenPos++];
634         }
635         throw new NoSuchElementException();
636     }
637 
638     /**
639      * Gets the index of the next token to return.
640      *
641      * @return the next token index.
642      */
643     @Override
644     public int nextIndex() {
645         return tokenPos;
646     }
647 
648     /**
649      * Gets the next token from the String.
650      * Equivalent to {@link #next()} except it returns null rather than
651      * throwing {@link NoSuchElementException} when no tokens remain.
652      *
653      * @return the next sequential token, or null when no more tokens are found.
654      */
655     public String nextToken() {
656         if (hasNext()) {
657             return tokens[tokenPos++];
658         }
659         return null;
660     }
661 
662     /**
663      * Gets the token previous to the last returned token.
664      *
665      * @return the previous token.
666      */
667     @Override
668     public String previous() {
669         if (hasPrevious()) {
670             return tokens[--tokenPos];
671         }
672         throw new NoSuchElementException();
673     }
674 
675     /**
676      * Gets the index of the previous token.
677      *
678      * @return the previous token index.
679      */
680     @Override
681     public int previousIndex() {
682         return tokenPos - 1;
683     }
684 
685     /**
686      * Gets the previous token from the String.
687      *
688      * @return the previous sequential token, or null when no more tokens are found.
689      */
690     public String previousToken() {
691         if (hasPrevious()) {
692             return tokens[--tokenPos];
693         }
694         return null;
695     }
696 
    /**
     * Reads character by character through the String to get the next token.
     * <p>
     * Leading characters matched by the ignored or trimmer matcher are skipped
     * first. Then either an empty token is added (end of input or an immediate
     * delimiter), or the rest of the field is handed to {@code readWithQuotes},
     * in quoting mode when the field starts with a quote.
     * </p>
     *
     * @param srcChars  the character array being tokenized.
     * @param start  the first character of field.
     * @param len  the length of the character array being tokenized.
     * @param workArea  a temporary work area.
     * @param tokenList  the list of parsed tokens.
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found.
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
        // skip all leading ignored/trimmable characters, unless they are
        // also the field delimiter or the quote character
        while (start < len) {
            // the longer of the two matches decides how far to skip
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 ||
                getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
                getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end: the field is empty and there is nothing after it
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token: a delimiter directly at the field start
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token; an opening quote is consumed here and its
        // position/length passed on so the closing quote can be recognised
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }
743 
    /**
     * Reads a possibly quoted string token.
     * <p>
     * Characters are accumulated in {@code workArea}; {@code trimStart} tracks
     * the length of the accumulated text up to the last character that must be
     * kept, so trailing trimmer matches copied after it are cut off when the
     * token is added.
     * </p>
     *
     * @param srcChars  the character array being tokenized.
     * @param start  the first character of field.
     * @param len  the length of the character array being tokenized.
     * @param workArea  a temporary work area.
     * @param tokenList  the list of parsed tokens.
     * @param quoteStart  the start position of the matched quote, 0 if no quoting.
     * @param quoteLen  the length of the matched quote, 0 if no quoting.
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found.
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        // the caller has already consumed the opening quote when quoteLen > 0
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote.  If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }
            }
            // copy regular character; reached both from inside quotes and
            // from outside quotes when none of the checks above matched
            workArea.append(srcChars[pos++]);
            trimStart = workArea.size();
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }
837 
    /**
     * Unsupported ListIterator operation: tokens cannot be removed through
     * this iterator.
     *
     * @throws UnsupportedOperationException always.
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }
847 
848     /**
849      * Resets this tokenizer, forgetting all parsing and iteration already completed.
850      * <p>
851      * This method allows the same tokenizer to be reused for the same String.
852      * </p>
853      *
854      * @return {@code this} instance.
855      */
856     public StrTokenizer reset() {
857         tokenPos = 0;
858         tokens = null;
859         return this;
860     }
861 
862     /**
863      * Reset this tokenizer, giving it a new input string to parse.
864      * In this manner you can re-use a tokenizer with the same settings
865      * on multiple input lines.
866      *
867      * @param input  the new character array to tokenize, not cloned, null sets no text to parse.
868      * @return {@code this} instance.
869      */
870     public StrTokenizer reset(final char[] input) {
871         reset();
872         this.chars = ArrayUtils.clone(input);
873         return this;
874     }
875 
876     /**
877      * Reset this tokenizer, giving it a new input string to parse.
878      * In this manner you can re-use a tokenizer with the same settings
879      * on multiple input lines.
880      *
881      * @param input  the new string to tokenize, null sets no text to parse.
882      * @return {@code this} instance.
883      */
884     public StrTokenizer reset(final String input) {
885         reset();
886         if (input != null) {
887             this.chars = input.toCharArray();
888         } else {
889             this.chars = null;
890         }
891         return this;
892     }
893 
894     /**
895      * Unsupported ListIterator operation.
896      *
897      * @param obj this parameter ignored.
898      * @throws UnsupportedOperationException always.
899      */
900     @Override
901     public void set(final String obj) {
902         throw new UnsupportedOperationException("set() is unsupported");
903     }
904 
905     /**
906      * Sets the field delimiter character.
907      *
908      * @param delim  the delimiter character to use.
909      * @return this, to enable chaining.
910      */
911     public StrTokenizer setDelimiterChar(final char delim) {
912         return setDelimiterMatcher(StrMatcher.charMatcher(delim));
913     }
914 
915     /**
916      * Sets the field delimiter matcher.
917      * <p>
918      * The delimiter is used to separate one token from another.
919      * </p>
920      *
921      * @param delim  the delimiter matcher to use.
922      * @return this, to enable chaining.
923      */
924     public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
925         if (delim == null) {
926             this.delimMatcher = StrMatcher.noneMatcher();
927         } else {
928             this.delimMatcher = delim;
929         }
930         return this;
931     }
932 
933     /**
934      * Sets the field delimiter string.
935      *
936      * @param delim  the delimiter string to use.
937      * @return this, to enable chaining.
938      */
939     public StrTokenizer setDelimiterString(final String delim) {
940         return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
941     }
942 
943     /**
944      * Sets whether the tokenizer should return empty tokens as null.
945      * The default for this property is false.
946      *
947      * @param emptyAsNull  whether empty tokens are returned as null.
948      * @return this, to enable chaining.
949      */
950     public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
951         this.emptyAsNull = emptyAsNull;
952         return this;
953     }
954 
955     /**
956      * Sets the character to ignore.
957      * <p>
958      * This character is ignored when parsing the String, unless it is
959      * within a quoted region.
960      *
961      * @param ignored  the ignored character to use.
962      * @return this, to enable chaining.
963      */
964     public StrTokenizer setIgnoredChar(final char ignored) {
965         return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
966     }
967 
968     /**
969      * Sets the matcher for characters to ignore.
970      * <p>
971      * These characters are ignored when parsing the String, unless they are
972      * within a quoted region.
973      * </p>
974      *
975      * @param ignored  the ignored matcher to use, null ignored.
976      * @return {@code this} instance.
977      */
978     public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
979         if (ignored != null) {
980             this.ignoredMatcher = ignored;
981         }
982         return this;
983     }
984 
985     /**
986      * Sets whether the tokenizer should ignore and not return empty tokens.
987      * The default for this property is true.
988      *
989      * @param ignoreEmptyTokens  whether empty tokens are not returned.
990      * @return {@code this} instance.
991      */
992     public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
993         this.ignoreEmptyTokens = ignoreEmptyTokens;
994         return this;
995     }
996 
997     /**
998      * Sets the quote character to use.
999      * <p>
1000      * The quote character is used to wrap data between the tokens.
1001      * This enables delimiters to be entered as data.
1002      * </p>
1003      *
1004      * @param quote  the quote character to use.
1005      * @return {@code this} instance.
1006      */
1007     public StrTokenizer setQuoteChar(final char quote) {
1008         return setQuoteMatcher(StrMatcher.charMatcher(quote));
1009     }
1010 
1011     /**
1012      * Sets the quote matcher to use.
1013      * <p>
1014      * The quote character is used to wrap data between the tokens.
1015      * This enables delimiters to be entered as data.
1016      * </p>
1017      *
1018      * @param quote  the quote matcher to use, null ignored.
1019      * @return {@code this} instance.
1020      */
1021     public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
1022         if (quote != null) {
1023             this.quoteMatcher = quote;
1024         }
1025         return this;
1026     }
1027 
1028     /**
1029      * Sets the matcher for characters to trim.
1030      * <p>
1031      * These characters are trimmed off on each side of the delimiter
1032      * until the token or quote is found.
1033      * </p>
1034      *
1035      * @param trimmer  the trimmer matcher to use, null ignored.
1036      * @return {@code this} instance.
1037      */
1038     public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1039         if (trimmer != null) {
1040             this.trimmerMatcher = trimmer;
1041         }
1042         return this;
1043     }
1044 
1045     // API
1046     /**
1047      * Gets the number of tokens found in the String.
1048      *
1049      * @return the number of matched tokens.
1050      */
1051     public int size() {
1052         checkTokenized();
1053         return tokens.length;
1054     }
1055 
1056     /**
1057      * Internal method to performs the tokenization.
1058      * <p>
1059      * Most users of this class do not need to call this method. This method
1060      * will be called automatically by other (public) methods when required.
1061      * </p>
1062      * <p>
1063      * This method exists to allow subclasses to add code before or after the
1064      * tokenization. For example, a subclass could alter the character array,
1065      * offset or count to be parsed, or call the tokenizer multiple times on
1066      * multiple strings. It is also be possible to filter the results.
1067      * </p>
1068      * <p>
1069      * {@link StrTokenizer} will always pass a zero offset and a count
1070      * equal to the length of the array to this method, however a subclass
1071      * may pass other values, or even an entirely different array.
1072      * </p>
1073      *
1074      * @param srcChars  the character array being tokenized, may be null.
1075      * @param offset  the start position within the character array, must be valid.
1076      * @param count  the number of characters to tokenize, must be valid.
1077      * @return the modifiable list of String tokens, unmodifiable if null array or zero count.
1078      */
1079     protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1080         if (ArrayUtils.isEmpty(srcChars)) {
1081             return Collections.emptyList();
1082         }
1083         final StrBuilder buf = new StrBuilder();
1084         final List<String> tokenList = new ArrayList<>();
1085         int pos = offset;
1086 
1087         // loop around the entire buffer
1088         while (pos >= 0 && pos < count) {
1089             // find next token
1090             pos = readNextToken(srcChars, pos, count, buf, tokenList);
1091 
1092             // handle case where end of string is a delimiter
1093             if (pos >= count) {
1094                 addToken(tokenList, StringUtils.EMPTY);
1095             }
1096         }
1097         return tokenList;
1098     }
1099 
1100     /**
1101      * Gets the String content that the tokenizer is parsing.
1102      *
1103      * @return the string content being parsed.
1104      */
1105     @Override
1106     public String toString() {
1107         if (tokens == null) {
1108             return "StrTokenizer[not tokenized yet]";
1109         }
1110         return "StrTokenizer" + getTokenList();
1111     }
1112 
1113 }