Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.lang3.text;
018
019import java.util.ArrayList;
020import java.util.Arrays;
021import java.util.Collections;
022import java.util.List;
023import java.util.ListIterator;
024import java.util.NoSuchElementException;
025import java.util.StringTokenizer;
026
027import org.apache.commons.lang3.ArrayUtils;
028import org.apache.commons.lang3.StringUtils;
029
030/**
031 * Tokenizes a string based on delimiters (separators)
032 * and supporting quoting and ignored character concepts.
033 * <p>
034 * This class can split a String into many smaller strings. It aims
035 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
036 * however it offers much more control and flexibility including implementing
037 * the {@link ListIterator} interface. By default, it is set up
038 * like {@link StringTokenizer}.
039 * </p>
040 * <p>
041 * The input String is split into a number of <em>tokens</em>.
042 * Each token is separated from the next String by a <em>delimiter</em>.
043 * One or more delimiter characters must be specified.
044 * </p>
045 * <p>
046 * Each token may be surrounded by quotes.
047 * The <em>quote</em> matcher specifies the quote character(s).
048 * A quote may be escaped within a quoted section by duplicating itself.
049 * </p>
050 * <p>
051 * Between each token and the delimiter are potentially characters that need trimming.
052 * The <em>trimmer</em> matcher specifies these characters.
053 * One usage might be to trim whitespace characters.
054 * </p>
055 * <p>
056 * At any point outside the quotes there might potentially be invalid characters.
057 * The <em>ignored</em> matcher specifies these characters to be removed.
058 * One usage might be to remove new line characters.
059 * </p>
060 * <p>
061 * Empty tokens may be removed or returned as null.
062 * </p>
063 * <pre>
064 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
065 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
066 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
067 * </pre>
068 *
069 * <table>
070 *  <caption>StrTokenizer properties and options</caption>
071 *  <tr>
072 *   <th>Property</th><th>Type</th><th>Default</th>
073 *  </tr>
074 *  <tr>
075 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
076 *  </tr>
077 *  <tr>
078 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
079 *  </tr>
080 *  <tr>
081 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
082 *  </tr>
083 *  <tr>
084 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
085 *  </tr>
086 *  <tr>
087 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
088 *  </tr>
089 * </table>
090 *
091 * @since 2.2
092 * @deprecated As of <a href="https://commons.apache.org/proper/commons-lang/changes-report.html#a3.6">3.6</a>, use Apache Commons Text
093 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html">
094 * StringTokenizer</a>.
095 */
096@Deprecated
097public class StrTokenizer implements ListIterator<String>, Cloneable {
098
    // @formatter:off
    /**
     * Pre-configured template for CSV parsing. It is configured with
     * {@link StrMatcher#commaMatcher()} as delimiter, {@link StrMatcher#doubleQuoteMatcher()}
     * as quote, {@link StrMatcher#noneMatcher()} as ignored matcher and
     * {@link StrMatcher#trimMatcher()} as trimmer; both empty-token options are switched off.
     * <p>
     * {@link #getCSVClone()} clones this instance for each caller.
     * </p>
     */
    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
            .setDelimiterMatcher(StrMatcher.commaMatcher())
            .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
            .setIgnoredMatcher(StrMatcher.noneMatcher())
            .setTrimmerMatcher(StrMatcher.trimMatcher())
            .setEmptyTokenAsNull(false)
            .setIgnoreEmptyTokens(false);
    // @formatter:on
108
    // @formatter:off
    /**
     * Pre-configured template for TSV parsing. It is configured with
     * {@link StrMatcher#tabMatcher()} as delimiter, {@link StrMatcher#doubleQuoteMatcher()}
     * as quote, {@link StrMatcher#noneMatcher()} as ignored matcher and
     * {@link StrMatcher#trimMatcher()} as trimmer; both empty-token options are switched off.
     * <p>
     * {@link #getTSVClone()} clones this instance for each caller.
     * </p>
     */
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
            .setDelimiterMatcher(StrMatcher.tabMatcher())
            .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
            .setIgnoredMatcher(StrMatcher.noneMatcher())
            .setTrimmerMatcher(StrMatcher.trimMatcher())
            .setEmptyTokenAsNull(false)
            .setIgnoreEmptyTokens(false);
    // @formatter:on
118
119    /**
120     * Gets a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
121     *
122     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
123     */
124    private static StrTokenizer getCSVClone() {
125        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
126    }
127
128    /**
129     * Gets a new tokenizer instance which parses Comma Separated Value strings
130     * initializing it with the given input.  The default for CSV processing
131     * will be trim whitespace from both ends (which can be overridden with
132     * the setTrimmer method).
133     * <p>
134     * You must call a "reset" method to set the string which you want to parse.
135     * </p>
136     *
137     * @return a new tokenizer instance which parses Comma Separated Value strings.
138     */
139    public static StrTokenizer getCSVInstance() {
140        return getCSVClone();
141    }
142
143    /**
144     * Gets a new tokenizer instance which parses Comma Separated Value strings
145     * initializing it with the given input.  The default for CSV processing
146     * will be trim whitespace from both ends (which can be overridden with
147     * the setTrimmer method).
148     *
149     * @param input  the text to parse.
150     * @return a new tokenizer instance which parses Comma Separated Value strings.
151     */
152    public static StrTokenizer getCSVInstance(final char[] input) {
153        final StrTokenizer tok = getCSVClone();
154        tok.reset(input);
155        return tok;
156    }
157
158    /**
159     * Gets a new tokenizer instance which parses Comma Separated Value strings
160     * initializing it with the given input.  The default for CSV processing
161     * will be trim whitespace from both ends (which can be overridden with
162     * the setTrimmer method).
163     *
164     * @param input  the text to parse.
165     * @return a new tokenizer instance which parses Comma Separated Value strings.
166     */
167    public static StrTokenizer getCSVInstance(final String input) {
168        final StrTokenizer tok = getCSVClone();
169        tok.reset(input);
170        return tok;
171    }
172
173    /**
174     * Gets a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
175     *
176     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
177     */
178    private static StrTokenizer getTSVClone() {
179        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
180    }
181
182    /**
183     * Gets a new tokenizer instance which parses Tab Separated Value strings.
184     * The default for CSV processing will be trim whitespace from both ends
185     * (which can be overridden with the setTrimmer method).
186     * <p>
187     * You must call a "reset" method to set the string which you want to parse.
188     * </p>
189     *
190     * @return a new tokenizer instance which parses Tab Separated Value strings.
191     */
192    public static StrTokenizer getTSVInstance() {
193        return getTSVClone();
194    }
195
196    /**
197     * Gets a new tokenizer instance which parses Tab Separated Value strings.
198     * The default for CSV processing will be trim whitespace from both ends
199     * (which can be overridden with the setTrimmer method).
200     *
201     * @param input  the string to parse.
202     * @return a new tokenizer instance which parses Tab Separated Value strings.
203     */
204    public static StrTokenizer getTSVInstance(final char[] input) {
205        final StrTokenizer tok = getTSVClone();
206        tok.reset(input);
207        return tok;
208    }
209
210    /**
211     * Gets a new tokenizer instance which parses Tab Separated Value strings.
212     * The default for CSV processing will be trim whitespace from both ends
213     * (which can be overridden with the setTrimmer method).
214     *
215     * @param input  the string to parse.
216     * @return a new tokenizer instance which parses Tab Separated Value strings.
217     */
218    public static StrTokenizer getTSVInstance(final String input) {
219        final StrTokenizer tok = getTSVClone();
220        tok.reset(input);
221        return tok;
222    }
223
    /** The text to work on, or {@code null} when no input has been set. */
    private char[] chars;

    /** The parsed tokens; {@code null} until {@link #checkTokenized()} fills it, and again after {@link #reset()}. */
    private String[] tokens;

    /** The current iteration position: the index in {@link #tokens} that {@link #next()} reads next. */
    private int tokenPos;

    /** The delimiter matcher; initially {@link StrMatcher#splitMatcher()}. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();

    /** The quote matcher; initially {@link StrMatcher#noneMatcher()}. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();

    /** The ignored matcher; initially {@link StrMatcher#noneMatcher()}. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();

    /** The trimmer matcher; initially {@link StrMatcher#noneMatcher()}. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null; initially {@code false}. */
    private boolean emptyAsNull;

    /** Whether to ignore (drop) empty tokens; initially {@code true}. */
    private boolean ignoreEmptyTokens = true;
250
251    /**
252     * Constructs a tokenizer splitting on space, tab, newline and formfeed
253     * as per StringTokenizer, but with no text to tokenize.
254     * <p>
255     * This constructor is normally used with {@link #reset(String)}.
256     * </p>
257     */
258    public StrTokenizer() {
259        this.chars = null;
260    }
261
262    /**
263     * Constructs a tokenizer splitting on space, tab, newline and formfeed
264     * as per StringTokenizer.
265     *
266     * @param input  the string which is to be parsed, not cloned.
267     */
268    public StrTokenizer(final char[] input) {
269        this.chars = ArrayUtils.clone(input);
270    }
271
272    /**
273     * Constructs a tokenizer splitting on the specified character.
274     *
275     * @param input  the string which is to be parsed, not cloned.
276     * @param delim the field delimiter character.
277     */
278    public StrTokenizer(final char[] input, final char delim) {
279        this(input);
280        setDelimiterChar(delim);
281    }
282
283    /**
284     * Constructs a tokenizer splitting on the specified delimiter character
285     * and handling quotes using the specified quote character.
286     *
287     * @param input  the string which is to be parsed, not cloned.
288     * @param delim  the field delimiter character.
289     * @param quote  the field quoted string character.
290     */
291    public StrTokenizer(final char[] input, final char delim, final char quote) {
292        this(input, delim);
293        setQuoteChar(quote);
294    }
295
296    /**
297     * Constructs a tokenizer splitting on the specified string.
298     *
299     * @param input  the string which is to be parsed, not cloned.
300     * @param delim the field delimiter string.
301     */
302    public StrTokenizer(final char[] input, final String delim) {
303        this(input);
304        setDelimiterString(delim);
305    }
306
307    /**
308     * Constructs a tokenizer splitting using the specified delimiter matcher.
309     *
310     * @param input  the string which is to be parsed, not cloned.
311     * @param delim  the field delimiter matcher.
312     */
313    public StrTokenizer(final char[] input, final StrMatcher delim) {
314        this(input);
315        setDelimiterMatcher(delim);
316    }
317
318    /**
319     * Constructs a tokenizer splitting using the specified delimiter matcher
320     * and handling quotes using the specified quote matcher.
321     *
322     * @param input  the string which is to be parsed, not cloned.
323     * @param delim  the field delimiter character.
324     * @param quote  the field quoted string character.
325     */
326    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
327        this(input, delim);
328        setQuoteMatcher(quote);
329    }
330
331    /**
332     * Constructs a tokenizer splitting on space, tab, newline and formfeed
333     * as per StringTokenizer.
334     *
335     * @param input  the string which is to be parsed.
336     */
337    public StrTokenizer(final String input) {
338        if (input != null) {
339            chars = input.toCharArray();
340        } else {
341            chars = null;
342        }
343    }
344
345    /**
346     * Constructs a tokenizer splitting on the specified delimiter character.
347     *
348     * @param input  the string which is to be parsed.
349     * @param delim  the field delimiter character.
350     */
351    public StrTokenizer(final String input, final char delim) {
352        this(input);
353        setDelimiterChar(delim);
354    }
355
356    /**
357     * Constructs a tokenizer splitting on the specified delimiter character
358     * and handling quotes using the specified quote character.
359     *
360     * @param input  the string which is to be parsed.
361     * @param delim  the field delimiter character.
362     * @param quote  the field quoted string character.
363     */
364    public StrTokenizer(final String input, final char delim, final char quote) {
365        this(input, delim);
366        setQuoteChar(quote);
367    }
368
369    /**
370     * Constructs a tokenizer splitting on the specified delimiter string.
371     *
372     * @param input  the string which is to be parsed.
373     * @param delim  the field delimiter string.
374     */
375    public StrTokenizer(final String input, final String delim) {
376        this(input);
377        setDelimiterString(delim);
378    }
379
380    /**
381     * Constructs a tokenizer splitting using the specified delimiter matcher.
382     *
383     * @param input  the string which is to be parsed.
384     * @param delim  the field delimiter matcher.
385     */
386    public StrTokenizer(final String input, final StrMatcher delim) {
387        this(input);
388        setDelimiterMatcher(delim);
389    }
390
391    /**
392     * Constructs a tokenizer splitting using the specified delimiter matcher
393     * and handling quotes using the specified quote matcher.
394     *
395     * @param input  the string which is to be parsed.
396     * @param delim  the field delimiter matcher.
397     * @param quote  the field quoted string matcher.
398     */
399    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
400        this(input, delim);
401        setQuoteMatcher(quote);
402    }
403
404    /**
405     * Unsupported ListIterator operation.
406     *
407     * @param obj this parameter ignored.
408     * @throws UnsupportedOperationException always.
409     */
410    @Override
411    public void add(final String obj) {
412        throw new UnsupportedOperationException("add() is unsupported");
413    }
414
415    /**
416     * Adds a token to a list, paying attention to the parameters we've set.
417     *
418     * @param list  the list to add to.
419     * @param tok  the token to add.
420     */
421    private void addToken(final List<String> list, String tok) {
422        if (StringUtils.isEmpty(tok)) {
423            if (isIgnoreEmptyTokens()) {
424                return;
425            }
426            if (isEmptyTokenAsNull()) {
427                tok = null;
428            }
429        }
430        list.add(tok);
431    }
432
433    /**
434     * Checks if tokenization has been done, and if not then do it.
435     */
436    private void checkTokenized() {
437        if (tokens == null) {
438            if (chars == null) {
439                // still call tokenize as subclass may do some work
440                final List<String> split = tokenize(null, 0, 0);
441                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
442            } else {
443                final List<String> split = tokenize(chars, 0, chars.length);
444                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
445            }
446        }
447    }
448
449    /**
450     * Creates a new instance of this Tokenizer. The new instance is reset so
451     * that it will be at the start of the token list.
452     * If a {@link CloneNotSupportedException} is caught, return {@code null}.
453     *
454     * @return a new instance of this Tokenizer which has been reset.
455     */
456    @Override
457    public Object clone() {
458        try {
459            return cloneReset();
460        } catch (final CloneNotSupportedException ex) {
461            return null;
462        }
463    }
464
465    /**
466     * Creates a new instance of this Tokenizer. The new instance is reset so that
467     * it will be at the start of the token list.
468     *
469     * @return a new instance of this Tokenizer which has been reset.
470     * @throws CloneNotSupportedException if there is a problem cloning.
471     */
472    Object cloneReset() throws CloneNotSupportedException {
473        // this method exists to enable 100% test coverage
474        final StrTokenizer cloned = (StrTokenizer) super.clone();
475        if (cloned.chars != null) {
476            cloned.chars = cloned.chars.clone();
477        }
478        cloned.reset();
479        return cloned;
480    }
481
482    /**
483     * Gets the String content that the tokenizer is parsing.
484     *
485     * @return the string content being parsed.
486     */
487    public String getContent() {
488        if (chars == null) {
489            return null;
490        }
491        return new String(chars);
492    }
493
494    /**
495     * Gets the field delimiter matcher.
496     *
497     * @return the delimiter matcher in use.
498     */
499    public StrMatcher getDelimiterMatcher() {
500        return this.delimMatcher;
501    }
502
503    /**
504     * Gets the ignored character matcher.
505     * <p>
506     * These characters are ignored when parsing the String, unless they are
507     * within a quoted region.
508     * The default value is not to ignore anything.
509     * </p>
510     *
511     * @return the ignored matcher in use.
512     */
513    public StrMatcher getIgnoredMatcher() {
514        return ignoredMatcher;
515    }
516
517    /**
518     * Gets the quote matcher currently in use.
519     * <p>
520     * The quote character is used to wrap data between the tokens.
521     * This enables delimiters to be entered as data.
522     * The default value is '"' (double quote).
523     * </p>
524     *
525     * @return the quote matcher in use.
526     */
527    public StrMatcher getQuoteMatcher() {
528        return quoteMatcher;
529    }
530
531    /**
532     * Gets a copy of the full token list as an independent modifiable array.
533     *
534     * @return the tokens as a String array.
535     */
536    public String[] getTokenArray() {
537        checkTokenized();
538        return tokens.clone();
539    }
540
541    /**
542     * Gets a copy of the full token list as an independent modifiable list.
543     *
544     * @return the tokens as a String array.
545     */
546    public List<String> getTokenList() {
547        checkTokenized();
548        final List<String> list = new ArrayList<>(tokens.length);
549        list.addAll(Arrays.asList(tokens));
550        return list;
551    }
552
553    /**
554     * Gets the trimmer character matcher.
555     * <p>
556     * These characters are trimmed off on each side of the delimiter
557     * until the token or quote is found.
558     * The default value is not to trim anything.
559     * </p>
560     *
561     * @return the trimmer matcher in use.
562     */
563    public StrMatcher getTrimmerMatcher() {
564        return trimmerMatcher;
565    }
566
567    /**
568     * Checks whether there are any more tokens.
569     *
570     * @return true if there are more tokens.
571     */
572    @Override
573    public boolean hasNext() {
574        checkTokenized();
575        return tokenPos < tokens.length;
576    }
577
578    /**
579     * Checks whether there are any previous tokens that can be iterated to.
580     *
581     * @return true if there are previous tokens.
582     */
583    @Override
584    public boolean hasPrevious() {
585        checkTokenized();
586        return tokenPos > 0;
587    }
588
589    /**
590     * Gets whether the tokenizer currently returns empty tokens as null.
591     * The default for this property is false.
592     *
593     * @return true if empty tokens are returned as null.
594     */
595    public boolean isEmptyTokenAsNull() {
596        return this.emptyAsNull;
597    }
598
599    /**
600     * Gets whether the tokenizer currently ignores empty tokens.
601     * The default for this property is true.
602     *
603     * @return true if empty tokens are not returned.
604     */
605    public boolean isIgnoreEmptyTokens() {
606        return ignoreEmptyTokens;
607    }
608
609    /**
610     * Checks if the characters at the index specified match the quote
611     * already matched in readNextToken().
612     *
613     * @param srcChars  the character array being tokenized.
614     * @param pos  the position to check for a quote.
615     * @param len  the length of the character array being tokenized.
616     * @param quoteStart  the start position of the matched quote, 0 if no quoting.
617     * @param quoteLen  the length of the matched quote, 0 if no quoting.
618     * @return true if a quote is matched.
619     */
620    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
621        for (int i = 0; i < quoteLen; i++) {
622            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
623                return false;
624            }
625        }
626        return true;
627    }
628
629    /**
630     * Gets the next token.
631     *
632     * @return the next String token.
633     * @throws NoSuchElementException if there are no more elements.
634     */
635    @Override
636    public String next() {
637        if (hasNext()) {
638            return tokens[tokenPos++];
639        }
640        throw new NoSuchElementException();
641    }
642
643    /**
644     * Gets the index of the next token to return.
645     *
646     * @return the next token index.
647     */
648    @Override
649    public int nextIndex() {
650        return tokenPos;
651    }
652
653    /**
654     * Gets the next token from the String.
655     * Equivalent to {@link #next()} except it returns null rather than
656     * throwing {@link NoSuchElementException} when no tokens remain.
657     *
658     * @return the next sequential token, or null when no more tokens are found.
659     */
660    public String nextToken() {
661        if (hasNext()) {
662            return tokens[tokenPos++];
663        }
664        return null;
665    }
666
667    /**
668     * Gets the token previous to the last returned token.
669     *
670     * @return the previous token.
671     */
672    @Override
673    public String previous() {
674        if (hasPrevious()) {
675            return tokens[--tokenPos];
676        }
677        throw new NoSuchElementException();
678    }
679
680    /**
681     * Gets the index of the previous token.
682     *
683     * @return the previous token index.
684     */
685    @Override
686    public int previousIndex() {
687        return tokenPos - 1;
688    }
689
690    /**
691     * Gets the previous token from the String.
692     *
693     * @return the previous sequential token, or null when no more tokens are found.
694     */
695    public String previousToken() {
696        if (hasPrevious()) {
697            return tokens[--tokenPos];
698        }
699        return null;
700    }
701
702    /**
703     * Reads character by character through the String to get the next token.
704     *
705     * @param srcChars  the character array being tokenized.
706     * @param start  the first character of field.
707     * @param len  the length of the character array being tokenized.
708     * @param workArea  a temporary work area.
709     * @param tokenList  the list of parsed tokens.
710     * @return the starting position of the next field (the character
711     *  immediately after the delimiter), or -1 if end of string found.
712     */
713    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
714        // skip all leading whitespace, unless it is the
715        // field delimiter or the quote character
716        while (start < len) {
717            final int removeLen = Math.max(
718                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
719                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
720            if (removeLen == 0 ||
721                getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
722                getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
723                break;
724            }
725            start += removeLen;
726        }
727
728        // handle reaching end
729        if (start >= len) {
730            addToken(tokenList, StringUtils.EMPTY);
731            return -1;
732        }
733
734        // handle empty token
735        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
736        if (delimLen > 0) {
737            addToken(tokenList, StringUtils.EMPTY);
738            return start + delimLen;
739        }
740
741        // handle found token
742        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
743        if (quoteLen > 0) {
744            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
745        }
746        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
747    }
748
    /**
     * Reads a possibly quoted string token into {@code workArea} and adds it to
     * {@code tokenList}.
     * <p>
     * The loop alternates between a quoting mode and a non-quoting mode until an
     * unquoted delimiter or the end of the input is reached. Only the first
     * {@code trimStart} characters of the work area become the token, which is how
     * trailing trimmer-matched characters are dropped.
     * </p>
     *
     * @param srcChars  the character array being tokenized.
     * @param start  the first character of field.
     * @param len  the length of the character array being tokenized.
     * @param workArea  a temporary work area.
     * @param tokenList  the list of parsed tokens.
     * @param quoteStart  the start position of the matched quote, 0 if no quoting.
     * @param quoteLen  the length of the matched quote, 0 if no quoting.
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if the end of the input was reached.
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        // the caller passes quoteLen > 0 only when the field opened with a quote
        boolean quoting = quoteLen > 0;
        // length of the work area known to belong to the token; anything
        // appended beyond it is trailing trim material unless content follows
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote.  If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                // (trimStart is deliberately NOT advanced here)
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }
            }
            // copy a regular character (reached both inside quotes and for
            // ordinary unquoted content) and mark everything so far as kept
            workArea.append(srcChars[pos++]);
            trimStart = workArea.size();
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }
842
843    /**
844     * Unsupported ListIterator operation.
845     *
846     * @throws UnsupportedOperationException always.
847     */
848    @Override
849    public void remove() {
850        throw new UnsupportedOperationException("remove() is unsupported");
851    }
852
853    /**
854     * Resets this tokenizer, forgetting all parsing and iteration already completed.
855     * <p>
856     * This method allows the same tokenizer to be reused for the same String.
857     * </p>
858     *
859     * @return {@code this} instance.
860     */
861    public StrTokenizer reset() {
862        tokenPos = 0;
863        tokens = null;
864        return this;
865    }
866
867    /**
868     * Reset this tokenizer, giving it a new input string to parse.
869     * In this manner you can re-use a tokenizer with the same settings
870     * on multiple input lines.
871     *
872     * @param input  the new character array to tokenize, not cloned, null sets no text to parse.
873     * @return {@code this} instance.
874     */
875    public StrTokenizer reset(final char[] input) {
876        reset();
877        this.chars = ArrayUtils.clone(input);
878        return this;
879    }
880
881    /**
882     * Reset this tokenizer, giving it a new input string to parse.
883     * In this manner you can re-use a tokenizer with the same settings
884     * on multiple input lines.
885     *
886     * @param input  the new string to tokenize, null sets no text to parse.
887     * @return {@code this} instance.
888     */
889    public StrTokenizer reset(final String input) {
890        reset();
891        if (input != null) {
892            this.chars = input.toCharArray();
893        } else {
894            this.chars = null;
895        }
896        return this;
897    }
898
899    /**
900     * Unsupported ListIterator operation.
901     *
902     * @param obj this parameter ignored.
903     * @throws UnsupportedOperationException always.
904     */
905    @Override
906    public void set(final String obj) {
907        throw new UnsupportedOperationException("set() is unsupported");
908    }
909
910    /**
911     * Sets the field delimiter character.
912     *
913     * @param delim  the delimiter character to use.
914     * @return {@code this} instance.
915     */
916    public StrTokenizer setDelimiterChar(final char delim) {
917        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
918    }
919
920    /**
921     * Sets the field delimiter matcher.
922     * <p>
923     * The delimiter is used to separate one token from another.
924     * </p>
925     *
926     * @param delim  the delimiter matcher to use.
927     * @return {@code this} instance.
928     */
929    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
930        if (delim == null) {
931            this.delimMatcher = StrMatcher.noneMatcher();
932        } else {
933            this.delimMatcher = delim;
934        }
935        return this;
936    }
937
938    /**
939     * Sets the field delimiter string.
940     *
941     * @param delim  the delimiter string to use.
942     * @return {@code this} instance.
943     */
944    public StrTokenizer setDelimiterString(final String delim) {
945        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
946    }
947
948    /**
949     * Sets whether the tokenizer should return empty tokens as null.
950     * The default for this property is false.
951     *
952     * @param emptyAsNull  whether empty tokens are returned as null.
953     * @return {@code this} instance.
954     */
955    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
956        this.emptyAsNull = emptyAsNull;
957        return this;
958    }
959
960    /**
961     * Sets the character to ignore.
962     * <p>
963     * This character is ignored when parsing the String, unless it is
964     * within a quoted region.
965     *
966     * @param ignored  the ignored character to use.
967     * @return {@code this} instance.
968     */
969    public StrTokenizer setIgnoredChar(final char ignored) {
970        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
971    }
972
973    /**
974     * Sets the matcher for characters to ignore.
975     * <p>
976     * These characters are ignored when parsing the String, unless they are
977     * within a quoted region.
978     * </p>
979     *
980     * @param ignored  the ignored matcher to use, null ignored.
981     * @return {@code this} instance.
982     */
983    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
984        if (ignored != null) {
985            this.ignoredMatcher = ignored;
986        }
987        return this;
988    }
989
990    /**
991     * Sets whether the tokenizer should ignore and not return empty tokens.
992     * The default for this property is true.
993     *
994     * @param ignoreEmptyTokens  whether empty tokens are not returned.
995     * @return {@code this} instance.
996     */
997    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
998        this.ignoreEmptyTokens = ignoreEmptyTokens;
999        return this;
1000    }
1001
1002    /**
1003     * Sets the quote character to use.
1004     * <p>
1005     * The quote character is used to wrap data between the tokens.
1006     * This enables delimiters to be entered as data.
1007     * </p>
1008     *
1009     * @param quote  the quote character to use.
1010     * @return {@code this} instance.
1011     */
1012    public StrTokenizer setQuoteChar(final char quote) {
1013        return setQuoteMatcher(StrMatcher.charMatcher(quote));
1014    }
1015
1016    /**
1017     * Sets the quote matcher to use.
1018     * <p>
1019     * The quote character is used to wrap data between the tokens.
1020     * This enables delimiters to be entered as data.
1021     * </p>
1022     *
1023     * @param quote  the quote matcher to use, null ignored.
1024     * @return {@code this} instance.
1025     */
1026    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
1027        if (quote != null) {
1028            this.quoteMatcher = quote;
1029        }
1030        return this;
1031    }
1032
1033    /**
1034     * Sets the matcher for characters to trim.
1035     * <p>
1036     * These characters are trimmed off on each side of the delimiter
1037     * until the token or quote is found.
1038     * </p>
1039     *
1040     * @param trimmer  the trimmer matcher to use, null ignored.
1041     * @return {@code this} instance.
1042     */
1043    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1044        if (trimmer != null) {
1045            this.trimmerMatcher = trimmer;
1046        }
1047        return this;
1048    }
1049
1050    /**
1051     * Gets the number of tokens found in the String.
1052     *
1053     * @return the number of matched tokens.
1054     */
1055    public int size() {
1056        checkTokenized();
1057        return tokens.length;
1058    }
1059
1060    /**
1061     * Internal method to performs the tokenization.
1062     * <p>
1063     * Most users of this class do not need to call this method. This method
1064     * will be called automatically by other (public) methods when required.
1065     * </p>
1066     * <p>
1067     * This method exists to allow subclasses to add code before or after the
1068     * tokenization. For example, a subclass could alter the character array,
1069     * offset or count to be parsed, or call the tokenizer multiple times on
1070     * multiple strings. It is also be possible to filter the results.
1071     * </p>
1072     * <p>
1073     * {@link StrTokenizer} will always pass a zero offset and a count
1074     * equal to the length of the array to this method, however a subclass
1075     * may pass other values, or even an entirely different array.
1076     * </p>
1077     *
1078     * @param srcChars  the character array being tokenized, may be null.
1079     * @param offset  the start position within the character array, must be valid.
1080     * @param count  the number of characters to tokenize, must be valid.
1081     * @return the modifiable list of String tokens, unmodifiable if null array or zero count.
1082     */
1083    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1084        if (ArrayUtils.isEmpty(srcChars)) {
1085            return Collections.emptyList();
1086        }
1087        final StrBuilder buf = new StrBuilder();
1088        final List<String> tokenList = new ArrayList<>();
1089        int pos = offset;
1090
1091        // loop around the entire buffer
1092        while (pos >= 0 && pos < count) {
1093            // find next token
1094            pos = readNextToken(srcChars, pos, count, buf, tokenList);
1095
1096            // handle case where end of string is a delimiter
1097            if (pos >= count) {
1098                addToken(tokenList, StringUtils.EMPTY);
1099            }
1100        }
1101        return tokenList;
1102    }
1103
1104    /**
1105     * Gets the String content that the tokenizer is parsing.
1106     *
1107     * @return the string content being parsed.
1108     */
1109    @Override
1110    public String toString() {
1111        if (tokens == null) {
1112            return "StrTokenizer[not tokenized yet]";
1113        }
1114        return "StrTokenizer" + getTokenList();
1115    }
1116
1117}