Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text;
018
019import java.util.ArrayList;
020import java.util.Collections;
021import java.util.List;
022import java.util.ListIterator;
023import java.util.NoSuchElementException;
024
025import org.apache.commons.lang3.ArrayUtils;
026import org.apache.commons.lang3.StringUtils;
027
028/**
029 * Tokenizes a string based on delimiters (separators)
030 * and supporting quoting and ignored character concepts.
031 * <p>
032 * This class can split a String into many smaller strings. It aims
033 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
034 * however it offers much more control and flexibility including implementing
035 * the {@code ListIterator} interface. By default, it is set up
036 * like {@code StringTokenizer}.
037 * <p>
038 * The input String is split into a number of <em>tokens</em>.
039 * Each token is separated from the next String by a <em>delimiter</em>.
040 * One or more delimiter characters must be specified.
041 * <p>
042 * Each token may be surrounded by quotes.
043 * The <em>quote</em> matcher specifies the quote character(s).
044 * A quote may be escaped within a quoted section by duplicating itself.
045 * <p>
046 * Between each token and the delimiter are potentially characters that need trimming.
047 * The <em>trimmer</em> matcher specifies these characters.
048 * One usage might be to trim whitespace characters.
049 * <p>
050 * At any point outside the quotes there might potentially be invalid characters.
051 * The <em>ignored</em> matcher specifies these characters to be removed.
052 * One usage might be to remove new line characters.
053 * <p>
054 * Empty tokens may be removed or returned as null.
055 * <pre>
056 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
057 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
058 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
059 * </pre>
060 *
061 * <table>
062 *  <caption>StrTokenizer properties and options</caption>
063 *  <tr>
064 *   <th>Property</th><th>Type</th><th>Default</th>
065 *  </tr>
066 *  <tr>
067 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
068 *  </tr>
069 *  <tr>
070 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
071 *  </tr>
072 *  <tr>
073 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
074 *  </tr>
075 *  <tr>
076 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
077 *  </tr>
078 *  <tr>
079 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
080 *  </tr>
081 * </table>
082 *
083 * @since 1.0
084 * @deprecated Deprecated as of 1.3, use {@link StringTokenizer} instead. This class will be removed in 2.0.
085 */
086@Deprecated
087public class StrTokenizer implements ListIterator<String>, Cloneable {
088
    /**
     * Prototype tokenizer for comma separated values: comma delimiter, double quote as the quote, nothing ignored, the trim matcher for trimming, and
     * empty tokens neither ignored nor converted to {@code null}. {@code getCSVClone()} returns clones of this instance.
     */
    // @formatter:off
    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
            .setDelimiterMatcher(StrMatcher.commaMatcher())
            .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
            .setIgnoredMatcher(StrMatcher.noneMatcher())
            .setTrimmerMatcher(StrMatcher.trimMatcher())
            .setEmptyTokenAsNull(false)
            .setIgnoreEmptyTokens(false);
    // @formatter:on
099
    /**
     * Prototype tokenizer for tab separated values: tab delimiter, double quote as the quote, nothing ignored, the trim matcher for trimming, and
     * empty tokens neither ignored nor converted to {@code null}. {@code getTSVClone()} returns clones of this instance.
     */
    // @formatter:off
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE = new StrTokenizer()
            .setDelimiterMatcher(StrMatcher.tabMatcher())
            .setQuoteMatcher(StrMatcher.doubleQuoteMatcher())
            .setIgnoredMatcher(StrMatcher.noneMatcher())
            .setTrimmerMatcher(StrMatcher.trimMatcher())
            .setEmptyTokenAsNull(false)
            .setIgnoreEmptyTokens(false);
    // @formatter:on
110
111    /**
112     * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
113     *
114     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
115     */
116    private static StrTokenizer getCSVClone() {
117        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
118    }
119
120    /**
121     * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be
122     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
123     * <p>
124     * You must call a "reset" method to set the string which you want to parse.
125     * </p>
126     *
127     * @return a new tokenizer instance which parses Comma Separated Value strings.
128     */
129    public static StrTokenizer getCSVInstance() {
130        return getCSVClone();
131    }
132
133    /**
134     * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be
135     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
136     *
137     * @param input the text to parse.
138     * @return a new tokenizer instance which parses Comma Separated Value strings.
139     */
140    public static StrTokenizer getCSVInstance(final char[] input) {
141        final StrTokenizer tok = getCSVClone();
142        tok.reset(input);
143        return tok;
144    }
145
146    /**
147     * Gets a new tokenizer instance which parses Comma Separated Value strings initializing it with the given input. The default for CSV processing will be
148     * trim whitespace from both ends (which can be overridden with the setTrimmer method).
149     *
150     * @param input the text to parse.
151     * @return a new tokenizer instance which parses Comma Separated Value strings.
152     */
153    public static StrTokenizer getCSVInstance(final String input) {
154        final StrTokenizer tok = getCSVClone();
155        tok.reset(input);
156        return tok;
157    }
158
159    /**
160     * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
161     *
162     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
163     */
164    private static StrTokenizer getTSVClone() {
165        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
166    }
167
168    /**
169     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can
170     * be overridden with the setTrimmer method).
171     * <p>
172     * You must call a "reset" method to set the string which you want to parse.
173     * </p>
174     *
175     * @return a new tokenizer instance which parses Tab Separated Value strings.
176     */
177    public static StrTokenizer getTSVInstance() {
178        return getTSVClone();
179    }
180
181    /**
182     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can
183     * be overridden with the setTrimmer method).
184     *
185     * @param input the string to parse.
186     * @return a new tokenizer instance which parses Tab Separated Value strings.
187     */
188    public static StrTokenizer getTSVInstance(final char[] input) {
189        final StrTokenizer tok = getTSVClone();
190        tok.reset(input);
191        return tok;
192    }
193
194    /**
195     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for CSV processing will be trim whitespace from both ends (which can
196     * be overridden with the setTrimmer method).
197     *
198     * @param input the string to parse.
199     * @return a new tokenizer instance which parses Tab Separated Value strings.
200     */
201    public static StrTokenizer getTSVInstance(final String input) {
202        final StrTokenizer tok = getTSVClone();
203        tok.reset(input);
204        return tok;
205    }
206
    /** The text to work on; {@code null} when no text has been supplied. */
    private char[] chars;

    /** The parsed tokens; {@code null} until tokenization has been performed, and {@code null} again after {@code reset()}. */
    private String[] tokens;

    /** The current iteration position: the index into {@code tokens} of the token that {@code next()} would return. */
    private int tokenPos;

    /** The delimiter matcher; initially {@code StrMatcher.splitMatcher()}. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();

    /** The quote matcher; initially {@code StrMatcher.noneMatcher()}. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();

    /** The ignored matcher; initially {@code StrMatcher.noneMatcher()}. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();

    /** The trimmer matcher; initially {@code StrMatcher.noneMatcher()}. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null; initially {@code false}. */
    private boolean emptyAsNull;

    /** Whether to ignore empty tokens; initially {@code true}. */
    private boolean ignoreEmptyTokens = true;
233
234    /**
235     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer, but with no text to tokenize.
236     * <p>
237     * This constructor is normally used with {@link #reset(String)}.
238     * </p>
239     */
240    public StrTokenizer() {
241        this.chars = null;
242    }
243
244    /**
245     * Constructs a tokenizer splitting on space, tab, newline and form feed
246     * as per StringTokenizer.
247     *
248     * @param input  the string which is to be parsed, not cloned.
249     */
250    public StrTokenizer(final char[] input) {
251        if (input == null) {
252            this.chars = null;
253        } else {
254            this.chars = input.clone();
255        }
256    }
257
258    /**
259     * Constructs a tokenizer splitting on the specified character.
260     *
261     * @param input  the string which is to be parsed, not cloned.
262     * @param delim the field delimiter character.
263     */
264    public StrTokenizer(final char[] input, final char delim) {
265        this(input);
266        setDelimiterChar(delim);
267    }
268
269    /**
270     * Constructs a tokenizer splitting on the specified delimiter character
271     * and handling quotes using the specified quote character.
272     *
273     * @param input  the string which is to be parsed, not cloned.
274     * @param delim  the field delimiter character.
275     * @param quote  the field quoted string character.
276     */
277    public StrTokenizer(final char[] input, final char delim, final char quote) {
278        this(input, delim);
279        setQuoteChar(quote);
280    }
281
282    /**
283     * Constructs a tokenizer splitting on the specified string.
284     *
285     * @param input  the string which is to be parsed, not cloned.
286     * @param delim the field delimiter string.
287     */
288    public StrTokenizer(final char[] input, final String delim) {
289        this(input);
290        setDelimiterString(delim);
291    }
292
293    /**
294     * Constructs a tokenizer splitting using the specified delimiter matcher.
295     *
296     * @param input  the string which is to be parsed, not cloned.
297     * @param delim  the field delimiter matcher.
298     */
299    public StrTokenizer(final char[] input, final StrMatcher delim) {
300        this(input);
301        setDelimiterMatcher(delim);
302    }
303
304    /**
305     * Constructs a tokenizer splitting using the specified delimiter matcher
306     * and handling quotes using the specified quote matcher.
307     *
308     * @param input  the string which is to be parsed, not cloned.
309     * @param delim  the field delimiter character.
310     * @param quote  the field quoted string character.
311     */
312    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
313        this(input, delim);
314        setQuoteMatcher(quote);
315    }
316
317    /**
318     * Constructs a tokenizer splitting on space, tab, newline and form feed
319     * as per StringTokenizer.
320     *
321     * @param input  the string which is to be parsed.
322     */
323    public StrTokenizer(final String input) {
324        if (input != null) {
325            chars = input.toCharArray();
326        } else {
327            chars = null;
328        }
329    }
330
331    /**
332     * Constructs a tokenizer splitting on the specified delimiter character.
333     *
334     * @param input  the string which is to be parsed.
335     * @param delim  the field delimiter character.
336     */
337    public StrTokenizer(final String input, final char delim) {
338        this(input);
339        setDelimiterChar(delim);
340    }
341
342    /**
343     * Constructs a tokenizer splitting on the specified delimiter character
344     * and handling quotes using the specified quote character.
345     *
346     * @param input  the string which is to be parsed.
347     * @param delim  the field delimiter character.
348     * @param quote  the field quoted string character.
349     */
350    public StrTokenizer(final String input, final char delim, final char quote) {
351        this(input, delim);
352        setQuoteChar(quote);
353    }
354
355    /**
356     * Constructs a tokenizer splitting on the specified delimiter string.
357     *
358     * @param input  the string which is to be parsed.
359     * @param delim  the field delimiter string.
360     */
361    public StrTokenizer(final String input, final String delim) {
362        this(input);
363        setDelimiterString(delim);
364    }
365
366    /**
367     * Constructs a tokenizer splitting using the specified delimiter matcher.
368     *
369     * @param input  the string which is to be parsed.
370     * @param delim  the field delimiter matcher.
371     */
372    public StrTokenizer(final String input, final StrMatcher delim) {
373        this(input);
374        setDelimiterMatcher(delim);
375    }
376
377    /**
378     * Constructs a tokenizer splitting using the specified delimiter matcher
379     * and handling quotes using the specified quote matcher.
380     *
381     * @param input  the string which is to be parsed.
382     * @param delim  the field delimiter matcher.
383     * @param quote  the field quoted string matcher.
384     */
385    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
386        this(input, delim);
387        setQuoteMatcher(quote);
388    }
389
390    /**
391     * Unsupported ListIterator operation.
392     *
393     * @param obj this parameter ignored.
394     * @throws UnsupportedOperationException always.
395     */
396    @Override
397    public void add(final String obj) {
398        throw new UnsupportedOperationException("add() is unsupported");
399    }
400
401    /**
402     * Adds a token to a list, paying attention to the parameters we've set.
403     *
404     * @param list  the list to add to.
405     * @param tok  the token to add.
406     */
407    private void addToken(final List<String> list, String tok) {
408        if (tok == null || tok.isEmpty()) {
409            if (isIgnoreEmptyTokens()) {
410                return;
411            }
412            if (isEmptyTokenAsNull()) {
413                tok = null;
414            }
415        }
416        list.add(tok);
417    }
418
419    /**
420     * Checks if tokenization has been done, and if not then do it.
421     */
422    private void checkTokenized() {
423        if (tokens == null) {
424            if (chars == null) {
425                // still call tokenize as subclass may do some work
426                final List<String> split = tokenize(null, 0, 0);
427                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
428            } else {
429                final List<String> split = tokenize(chars, 0, chars.length);
430                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
431            }
432        }
433    }
434
435    /**
436     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token list. If a
437     * {@link CloneNotSupportedException} is caught, return {@code null}.
438     *
439     * @return a new instance of this Tokenizer which has been reset.
440     */
441    @Override
442    public Object clone() {
443        try {
444            return cloneReset();
445        } catch (final CloneNotSupportedException ex) {
446            return null;
447        }
448    }
449
450    /**
451     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token list.
452     *
453     * @return a new instance of this Tokenizer which has been reset.
454     * @throws CloneNotSupportedException if there is a problem cloning.
455     */
456    Object cloneReset() throws CloneNotSupportedException {
457        // this method exists to enable 100% test coverage
458        final StrTokenizer cloned = (StrTokenizer) super.clone();
459        if (cloned.chars != null) {
460            cloned.chars = cloned.chars.clone();
461        }
462        cloned.reset();
463        return cloned;
464    }
465
466    /**
467     * Gets the String content that the tokenizer is parsing.
468     *
469     * @return The string content being parsed.
470     */
471    public String getContent() {
472        if (chars == null) {
473            return null;
474        }
475        return new String(chars);
476    }
477
478    /**
479     * Gets the field delimiter matcher.
480     *
481     * @return The delimiter matcher in use.
482     */
483    public StrMatcher getDelimiterMatcher() {
484        return this.delimMatcher;
485    }
486
487    /**
488     * Gets the ignored character matcher.
489     * <p>
490     * These characters are ignored when parsing the String, unless they are within a quoted region. The default value is not to ignore anything.
491     * </p>
492     *
493     * @return The ignored matcher in use.
494     */
495    public StrMatcher getIgnoredMatcher() {
496        return ignoredMatcher;
497    }
498
499    /**
500     * Gets the quote matcher currently in use.
501     * <p>
502     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The default value is '"' (double quote).
503     * </p>
504     *
505     * @return The quote matcher in use.
506     */
507    public StrMatcher getQuoteMatcher() {
508        return quoteMatcher;
509    }
510
511    /**
512     * Gets a copy of the full token list as an independent modifiable array.
513     *
514     * @return The tokens as a String array.
515     */
516    public String[] getTokenArray() {
517        checkTokenized();
518        return tokens.clone();
519    }
520
521    /**
522     * Gets a copy of the full token list as an independent modifiable list.
523     *
524     * @return The tokens as a String array.
525     */
526    public List<String> getTokenList() {
527        checkTokenized();
528        final List<String> list = new ArrayList<>(tokens.length);
529        Collections.addAll(list, tokens);
530
531        return list;
532    }
533
534    /**
535     * Gets the trimmer character matcher.
536     * <p>
537     * These characters are trimmed off on each side of the delimiter until the token or quote is found. The default value is not to trim anything.
538     * </p>
539     *
540     * @return The trimmer matcher in use.
541     */
542    public StrMatcher getTrimmerMatcher() {
543        return trimmerMatcher;
544    }
545
546    /**
547     * Checks whether there are any more tokens.
548     *
549     * @return true if there are more tokens.
550     */
551    @Override
552    public boolean hasNext() {
553        checkTokenized();
554        return tokenPos < tokens.length;
555    }
556
557    /**
558     * Checks whether there are any previous tokens that can be iterated to.
559     *
560     * @return true if there are previous tokens.
561     */
562    @Override
563    public boolean hasPrevious() {
564        checkTokenized();
565        return tokenPos > 0;
566    }
567
568    /**
569     * Gets whether the tokenizer currently returns empty tokens as null.
570     * The default for this property is false.
571     *
572     * @return true if empty tokens are returned as null.
573     */
574    public boolean isEmptyTokenAsNull() {
575        return this.emptyAsNull;
576    }
577
578    /**
579     * Gets whether the tokenizer currently ignores empty tokens.
580     * The default for this property is true.
581     *
582     * @return true if empty tokens are not returned.
583     */
584    public boolean isIgnoreEmptyTokens() {
585        return ignoreEmptyTokens;
586    }
587
588    /**
589     * Checks if the characters at the index specified match the quote
590     * already matched in readNextToken().
591     *
592     * @param srcChars  the character array being tokenized.
593     * @param pos  the position to check for a quote.
594     * @param len  the length of the character array being tokenized.
595     * @param quoteStart  the start position of the matched quote, 0 if no quoting.
596     * @param quoteLen  the length of the matched quote, 0 if no quoting.
597     * @return true if a quote is matched.
598     */
599    private boolean isQuote(final char[] srcChars,
600                            final int pos,
601                            final int len,
602                            final int quoteStart,
603                            final int quoteLen) {
604        for (int i = 0; i < quoteLen; i++) {
605            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
606                return false;
607            }
608        }
609        return true;
610    }
611
612    /**
613     * Gets the next token.
614     *
615     * @return The next String token.
616     * @throws NoSuchElementException if there are no more elements.
617     */
618    @Override
619    public String next() {
620        if (hasNext()) {
621            return tokens[tokenPos++];
622        }
623        throw new NoSuchElementException();
624    }
625
626    /**
627     * Gets the index of the next token to return.
628     *
629     * @return The next token index.
630     */
631    @Override
632    public int nextIndex() {
633        return tokenPos;
634    }
635
636    /**
637     * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing {@link NoSuchElementException} when no
638     * tokens remain.
639     *
640     * @return The next sequential token, or null when no more tokens are found.
641     */
642    public String nextToken() {
643        if (hasNext()) {
644            return tokens[tokenPos++];
645        }
646        return null;
647    }
648
649    /**
650     * Gets the token previous to the last returned token.
651     *
652     * @return The previous token.
653     */
654    @Override
655    public String previous() {
656        if (hasPrevious()) {
657            return tokens[--tokenPos];
658        }
659        throw new NoSuchElementException();
660    }
661
662    /**
663     * Gets the index of the previous token.
664     *
665     * @return The previous token index.
666     */
667    @Override
668    public int previousIndex() {
669        return tokenPos - 1;
670    }
671
672    /**
673     * Gets the previous token from the String.
674     *
675     * @return The previous sequential token, or null when no more tokens are found.
676     */
677    public String previousToken() {
678        if (hasPrevious()) {
679            return tokens[--tokenPos];
680        }
681        return null;
682    }
683
684    /**
685     * Reads character by character through the String to get the next token.
686     *
687     * @param srcChars  the character array being tokenized.
688     * @param start     the first character of field.
689     * @param len       the length of the character array being tokenized.
690     * @param workArea  a temporary work area.
691     * @param tokenList the list of parsed tokens.
692     * @return The starting position of the next field (the character immediately after the delimiter), or -1 if end of string found.
693     */
694    private int readNextToken(final char[] srcChars,
695                              int start,
696                              final int len,
697                              final StrBuilder workArea,
698                              final List<String> tokenList) {
699        // skip all leading whitespace, unless it is the
700        // field delimiter or the quote character
701        while (start < len) {
702            final int removeLen = Math.max(
703                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
704                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
705            if (removeLen == 0
706                    || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
707                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
708                break;
709            }
710            start += removeLen;
711        }
712
713        // handle reaching end
714        if (start >= len) {
715            addToken(tokenList, StringUtils.EMPTY);
716            return -1;
717        }
718
719        // handle empty token
720        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
721        if (delimLen > 0) {
722            addToken(tokenList, StringUtils.EMPTY);
723            return start + delimLen;
724        }
725
726        // handle found token
727        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
728        if (quoteLen > 0) {
729            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
730        }
731        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
732    }
733
734    /**
735     * Reads a possibly quoted string token.
736     *
737     * @param srcChars   the character array being tokenized.
738     * @param start      the first character of field.
739     * @param len        the length of the character array being tokenized.
740     * @param workArea   a temporary work area.
741     * @param tokenList  the list of parsed tokens.
742     * @param quoteStart the start position of the matched quote, 0 if no quoting.
743     * @param quoteLen   the length of the matched quote, 0 if no quoting.
744     * @return The starting position of the next field (the character immediately after the delimiter, or if end of string found, then the length of string.
745     */
746    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
747                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
748        // Loop until we've found the end of the quoted
749        // string or the end of the input
750        workArea.clear();
751        int pos = start;
752        boolean quoting = quoteLen > 0;
753        int trimStart = 0;
754
755        while (pos < len) {
756            // quoting mode can occur several times throughout a string
757            // we must switch between quoting and non-quoting until we
758            // encounter a non-quoted delimiter, or end of string
759            if (quoting) {
760                // In quoting mode
761
762                // If we've found a quote character, see if it's
763                // followed by a second quote.  If so, then we need
764                // to actually put the quote character into the token
765                // rather than end the token.
766                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
767                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
768                        // matched pair of quotes, thus an escaped quote
769                        workArea.append(srcChars, pos, quoteLen);
770                        pos += quoteLen * 2;
771                        trimStart = workArea.size();
772                        continue;
773                    }
774
775                    // end of quoting
776                    quoting = false;
777                    pos += quoteLen;
778                    continue;
779                }
780
781            } else {
782                // Not in quoting mode
783
784                // check for delimiter, and thus end of token
785                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
786                if (delimLen > 0) {
787                    // return condition when end of token found
788                    addToken(tokenList, workArea.substring(0, trimStart));
789                    return pos + delimLen;
790                }
791
792                // check for quote, and thus back into quoting mode
793                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
794                    quoting = true;
795                    pos += quoteLen;
796                    continue;
797                }
798
799                // check for ignored (outside quotes), and ignore
800                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
801                if (ignoredLen > 0) {
802                    pos += ignoredLen;
803                    continue;
804                }
805
806                // check for trimmed character
807                // don't yet know if its at the end, so copy to workArea
808                // use trimStart to keep track of trim at the end
809                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
810                if (trimmedLen > 0) {
811                    workArea.append(srcChars, pos, trimmedLen);
812                    pos += trimmedLen;
813                    continue;
814                }
815
816            }
817            // copy regular character from inside quotes
818            workArea.append(srcChars[pos++]);
819            trimStart = workArea.size();
820        }
821
822        // return condition when end of string found
823        addToken(tokenList, workArea.substring(0, trimStart));
824        return -1;
825    }
826
827    /**
828     * Unsupported ListIterator operation.
829     *
830     * @throws UnsupportedOperationException always.
831     */
832    @Override
833    public void remove() {
834        throw new UnsupportedOperationException("remove() is unsupported");
835    }
836
837    /**
838     * Resets this tokenizer, forgetting all parsing and iteration already completed.
839     * <p>
840     * This method allows the same tokenizer to be reused for the same String.
841     * </p>
842     *
843     * @return {@code this} instance.
844     */
845    public StrTokenizer reset() {
846        tokenPos = 0;
847        tokens = null;
848        return this;
849    }
850
851    /**
852     * Reset this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the same settings on multiple input lines.
853     *
854     * @param input the new character array to tokenize, not cloned, null sets no text to parse.
855     * @return {@code this} instance.
856     */
857    public StrTokenizer reset(final char[] input) {
858        reset();
859        if (input != null) {
860            this.chars = input.clone();
861        } else {
862            this.chars = null;
863        }
864        return this;
865    }
866
867    /**
868     * Reset this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the same settings on multiple input lines.
869     *
870     * @param input the new string to tokenize, null sets no text to parse.
871     * @return {@code this} instance.
872     */
873    public StrTokenizer reset(final String input) {
874        reset();
875        if (input != null) {
876            this.chars = input.toCharArray();
877        } else {
878            this.chars = null;
879        }
880        return this;
881    }
882
883    /**
884     * Unsupported ListIterator operation.
885     *
886     * @param obj this parameter ignored.
887     * @throws UnsupportedOperationException Always thrown.
888     */
889    @Override
890    public void set(final String obj) {
891        throw new UnsupportedOperationException("set() is unsupported");
892    }
893
894    /**
895     * Sets the field delimiter character.
896     *
897     * @param delim  the delimiter character to use.
898     * @return {@code this} instance.
899     */
900    public StrTokenizer setDelimiterChar(final char delim) {
901        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
902    }
903
904    /**
905     * Sets the field delimiter matcher.
906     * <p>
907     * The delimiter is used to separate one token from another.
908     * </p>
909     *
910     * @param delim  the delimiter matcher to use.
911     * @return {@code this} instance.
912     */
913    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
914        if (delim == null) {
915            this.delimMatcher = StrMatcher.noneMatcher();
916        } else {
917            this.delimMatcher = delim;
918        }
919        return this;
920    }
921
922    /**
923     * Sets the field delimiter string.
924     *
925     * @param delim  the delimiter string to use.
926     * @return {@code this} instance.
927     */
928    public StrTokenizer setDelimiterString(final String delim) {
929        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
930    }
931
932    /**
933     * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
934     *
935     * @param emptyAsNull whether empty tokens are returned as null.
936     * @return {@code this} instance.
937     */
938    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
939        this.emptyAsNull = emptyAsNull;
940        return this;
941    }
942
943    /**
944     * Sets the character to ignore.
945     * <p>
946     * This character is ignored when parsing the String, unless it is within a quoted region.
947     * </p>
948     *
949     * @param ignored the ignored character to use.
950     * @return {@code this} instance.
951     */
952    public StrTokenizer setIgnoredChar(final char ignored) {
953        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
954    }
955
956    /**
957     * Sets the matcher for characters to ignore.
958     * <p>
959     * These characters are ignored when parsing the String, unless they are within a quoted region.
960     * </p>
961     *
962     * @param ignored the ignored matcher to use, null ignored.
963     * @return {@code this} instance.
964     */
965    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
966        if (ignored != null) {
967            this.ignoredMatcher = ignored;
968        }
969        return this;
970    }
971
972    /**
973     * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
974     *
975     * @param ignoreEmptyTokens whether empty tokens are not returned.
976     * @return {@code this} instance.
977     */
978    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
979        this.ignoreEmptyTokens = ignoreEmptyTokens;
980        return this;
981    }
982
983    /**
984     * Sets the quote character to use.
985     * <p>
986     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
987     * </p>
988     *
989     * @param quote the quote character to use.
990     * @return {@code this} instance.
991     */
992    public StrTokenizer setQuoteChar(final char quote) {
993        return setQuoteMatcher(StrMatcher.charMatcher(quote));
994    }
995
996    /**
997     * Sets the quote matcher to use.
998     * <p>
999     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
1000     * </p>
1001     *
1002     * @param quote the quote matcher to use, null ignored.
1003     * @return {@code this} instance.
1004     */
1005    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
1006        if (quote != null) {
1007            this.quoteMatcher = quote;
1008        }
1009        return this;
1010    }
1011
1012    /**
1013     * Sets the matcher for characters to trim.
1014     * <p>
1015     * These characters are trimmed off on each side of the delimiter until the token or quote is found.
1016     * </p>
1017     *
1018     * @param trimmer the trimmer matcher to use, null ignored
1019     * @return {@code this} instance.
1020     */
1021    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1022        if (trimmer != null) {
1023            this.trimmerMatcher = trimmer;
1024        }
1025        return this;
1026    }
1027
1028    /**
1029     * Gets the number of tokens found in the String.
1030     *
1031     * @return The number of matched tokens.
1032     */
1033    public int size() {
1034        checkTokenized();
1035        return tokens.length;
1036    }
1037
1038    /**
1039     * Internal method to performs the tokenization.
1040     * <p>
1041     * Most users of this class do not need to call this method. This method will be called automatically by other (public) methods when required.
1042     * </p>
1043     * <p>
1044     * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass could alter the character array, offset or
1045     * count to be parsed, or call the tokenizer multiple times on multiple strings. It is also be possible to filter the results.
1046     * </p>
1047     * <p>
1048     * {@code StrTokenizer} will always pass a zero offset and a count equal to the length of the array to this method, however a subclass may pass other
1049     * values, or even an entirely different array.
1050     * </p>
1051     *
1052     * @param srcChars the character array being tokenized, may be null.
1053     * @param offset   the start position within the character array, must be valid.
1054     * @param count    the number of characters to tokenize, must be valid.
1055     * @return The modifiable list of String tokens, unmodifiable if null array or zero count.
1056     */
1057    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
1058        if (srcChars == null || count == 0) {
1059            return Collections.emptyList();
1060        }
1061        final StrBuilder buf = new StrBuilder();
1062        final List<String> tokenList = new ArrayList<>();
1063        int pos = offset;
1064
1065        // loop around the entire buffer
1066        while (pos >= 0 && pos < count) {
1067            // find next token
1068            pos = readNextToken(srcChars, pos, count, buf, tokenList);
1069
1070            // handle case where end of string is a delimiter
1071            if (pos >= count) {
1072                addToken(tokenList, StringUtils.EMPTY);
1073            }
1074        }
1075        return tokenList;
1076    }
1077
1078    /**
1079     * Gets the String content that the tokenizer is parsing.
1080     *
1081     * @return The string content being parsed.
1082     */
1083    @Override
1084    public String toString() {
1085        if (tokens == null) {
1086            return "StrTokenizer[not tokenized yet]";
1087        }
1088        return "StrTokenizer" + getTokenList();
1089    }
1090
1091}