public final class WordDelimiterIterator
extends java.lang.Object
| Modifier and Type | Field and Description |
|---|---|
static int |
ALPHA |
static int |
ALPHANUM |
private byte[] |
charTypeTable |
(package private) int |
current
Beginning of subword
|
static byte[] |
DEFAULT_WORD_DELIM_TABLE |
(package private) static int |
DIGIT |
static int |
DONE
Indicates the end of iteration
|
(package private) int |
end
End of subword
|
(package private) int |
endBounds
end position of text, excluding trailing delimiters
|
private boolean |
hasFinalPossessive |
(package private) int |
length |
(package private) static int |
LOWER |
private boolean |
skipPossessive
if true, need to skip over a possessive found in the last call to next()
|
(package private) boolean |
splitOnCaseChange
If false, causes case changes to be ignored (subwords will only be generated
given SUBWORD_DELIM tokens).
|
(package private) boolean |
splitOnNumerics
If false, causes numeric changes to be ignored (subwords will only be generated
given SUBWORD_DELIM tokens).
|
(package private) int |
startBounds
start position of text, excluding leading delimiters
|
(package private) boolean |
stemEnglishPossessive
If true, causes trailing "'s" to be removed for each subword.
|
(package private) static int |
SUBWORD_DELIM |
(package private) char[] |
text |
(package private) static int |
UPPER |
| Constructor and Description |
|---|
WordDelimiterIterator(byte[] charTypeTable,
boolean splitOnCaseChange,
boolean splitOnNumerics,
boolean stemEnglishPossessive)
Create a new WordDelimiterIterator operating with the supplied rules.
|
| Modifier and Type | Method and Description |
|---|---|
private int |
charType(int ch)
Determines the type of the given character
|
private boolean |
endsWithPossessive(int pos)
Determines if the text at the given position indicates an English possessive which should be removed
|
static byte |
getType(int ch)
Computes the type of the given character
|
(package private) static boolean |
isAlpha(int type)
Checks if the given word type includes
ALPHA |
private boolean |
isBreak(int lastType,
int type)
Determines whether the transition from lastType to type indicates a break
|
(package private) static boolean |
isDigit(int type)
Checks if the given word type includes
DIGIT |
(package private) boolean |
isSingleWord()
Determines if the current word contains only one subword.
|
(package private) static boolean |
isSubwordDelim(int type)
Checks if the given word type includes
SUBWORD_DELIM |
(package private) static boolean |
isUpper(int type)
Checks if the given word type includes
UPPER |
(package private) int |
next()
Advance to the next subword in the string.
|
private void |
setBounds()
Set the internal word bounds (remove leading and trailing delimiters).
|
(package private) void |
setText(char[] text,
int length)
Reset the text to a new value, and reset all state
|
(package private) int |
type()
Return the type of the current subword.
|
static final int LOWER
static final int UPPER
static final int DIGIT
static final int SUBWORD_DELIM
public static final int ALPHA
public static final int ALPHANUM
public static final int DONE
public static final byte[] DEFAULT_WORD_DELIM_TABLE
char[] text
int length
int startBounds
int endBounds
int current
int end
private boolean hasFinalPossessive
final boolean splitOnCaseChange
final boolean splitOnNumerics
final boolean stemEnglishPossessive
private final byte[] charTypeTable
private boolean skipPossessive
WordDelimiterIterator(byte[] charTypeTable,
boolean splitOnCaseChange,
boolean splitOnNumerics,
boolean stemEnglishPossessive)
charTypeTable - table containing character types
splitOnCaseChange - if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
splitOnNumerics - if true, causes "j2se" to be three tokens; "j" "2" "se"
stemEnglishPossessive - if true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
int next()
Returns: DONE if all subwords have been returned
int type()
void setText(char[] text,
int length)
text - New text
length - length of the text
private boolean isBreak(int lastType,
int type)
lastType - Last subword type
type - Current subword type
Returns: true if the transition indicates a break, false otherwise
boolean isSingleWord()
Returns: true if the current word contains only one subword, false otherwise
private void setBounds()
private boolean endsWithPossessive(int pos)
pos - Position in the text to check if it indicates an English possessive
Returns: true if the text at the position indicates an English possessive, false otherwise
private int charType(int ch)
ch - Character whose type is to be determined
public static byte getType(int ch)
ch - Character whose type is to be determined
static boolean isAlpha(int type)
Checks if the given word type includes ALPHA
type - Word type to check
Returns: true if the type contains ALPHA, false otherwise
static boolean isDigit(int type)
Checks if the given word type includes DIGIT
type - Word type to check
Returns: true if the type contains DIGIT, false otherwise
static boolean isSubwordDelim(int type)
Checks if the given word type includes SUBWORD_DELIM
type - Word type to check
Returns: true if the type contains SUBWORD_DELIM, false otherwise
static boolean isUpper(int type)
Checks if the given word type includes UPPER
type - Word type to check
Returns: true if the type contains UPPER, false otherwise