Deprecated. Use HTMLStripCharFilter instead.
@Deprecated
public class LegacyHTMLStripCharFilter
extends org.apache.lucene.analysis.BaseCharFilter
This class is NOT recommended for new users and should be considered UNSUPPORTED.
In Solr version 3.5 and earlier, HTMLStripCharFilter(Factory) had known bugs in the offsets it provided, triggering e.g. exceptions in highlighting.
This class is provided as a possible alternative for people who depend on the "broken" behavior of HTMLStripCharFilter in Solr version 3.5 and earlier, and/or who don't like the changes introduced by the Solr 3.6+ version of HTMLStripCharFilterFactory. (See the 3.6.0 release section of solr/CHANGES.txt for a list of differences in behavior.)
| Modifier and Type | Field and Description |
|---|---|
static int |
DEFAULT_READ_AHEAD
Deprecated.
|
private static java.util.HashMap<java.lang.String,java.lang.Character> |
entityTable
Deprecated.
|
private static int |
EOF
Deprecated.
|
private java.util.Set<java.lang.String> |
escapedTags
Deprecated.
|
private int |
lastMark
Deprecated.
|
private static int |
MATCH
Deprecated.
|
private static int |
MISMATCH
Deprecated.
|
private int |
numEaten
Deprecated.
|
private int |
numRead
Deprecated.
|
private int |
numReturned
Deprecated.
|
private int |
numWhitespace
Deprecated.
|
private java.lang.StringBuilder |
pushed
Deprecated.
|
private int |
readAheadLimit
Deprecated.
|
private int |
safeReadAheadLimit
Deprecated.
|
private java.lang.StringBuilder |
sb
Deprecated.
|
| Constructor and Description |
|---|
LegacyHTMLStripCharFilter(org.apache.lucene.analysis.CharStream source)
Deprecated.
|
LegacyHTMLStripCharFilter(org.apache.lucene.analysis.CharStream source,
java.util.Set<java.lang.String> escapedTags)
Deprecated.
|
LegacyHTMLStripCharFilter(org.apache.lucene.analysis.CharStream source,
java.util.Set<java.lang.String> escapedTags,
int readAheadLimit)
Deprecated.
|
| Modifier and Type | Method and Description | |
|---|---|---|
void |
close()
Deprecated.
|
|
private int |
eatSSI()
Deprecated.
|
|
(package private) int |
findEndTag()
Deprecated.
|
|
int |
getReadAheadLimit()
Deprecated.
|
|
private boolean |
isAlpha(int ch)
Deprecated.
|
|
private boolean |
isDigit(int ch)
Deprecated.
|
|
private boolean |
isFirstIdChar(int ch)
Deprecated.
|
|
private boolean |
isHex(int ch)
Deprecated.
|
|
private boolean |
isIdChar(int ch)
Deprecated.
From HTML 4.0
[4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
[5] Name ::= (Letter | '_' | ':') (NameChar)*
[6] Names ::= Name (#x20 Name)*
[7] Nmtoken ::= (NameChar)+
[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
|
|
private boolean |
isSpace(int ch)
Deprecated.
|
|
static void |
main(java.lang.String[] args)
Deprecated.
|
|
private int |
next()
Deprecated.
|
|
private int |
nextSkipWS()
Deprecated.
|
|
private int |
peek()
Deprecated.
|
|
private void |
push(int ch)
Deprecated.
|
|
int |
read()
Deprecated.
|
|
int |
read(char[] cbuf,
int off,
int len)
Deprecated.
|
|
private int |
readAttr2()
Deprecated.
[10] AttValue ::= '"' ([^<&"] | Reference)* '"'
| "'" ([^<&'] | Reference)* "'"
need to also handle unquoted attributes, and attributes w/o values:
|
|
private int |
readBang(boolean inScript)
Deprecated.
valid comments according to HTML specs
<!-- Hello -->
#comments inside of an entity decl:
Turns out, IE & mozilla don't parse comments correctly.
|
|
private int |
readComment(boolean inScript)
Deprecated.
|
|
private int |
readEntity()
Deprecated.
|
|
private int |
readName(boolean checkEscaped)
Deprecated.
|
|
private int |
readNumericEntity()
Deprecated.
|
|
private int |
readProcessingInstruction()
Deprecated.
|
|
private int |
readScriptString()
Deprecated.
|
|
private int |
readTag()
Deprecated.
|
|
private void |
restoreState()
Deprecated.
|
|
private void |
saveState()
Deprecated.
|
Methods inherited from class org.apache.lucene.analysis.BaseCharFilter: addOffCorrectMap, correct, getLastCumulativeDiff
private int readAheadLimit
private int safeReadAheadLimit
private int numWhitespace
private int numRead
private int numEaten
private int numReturned
private int lastMark
private java.util.Set<java.lang.String> escapedTags
private final java.lang.StringBuilder pushed
private static final int EOF
private static final int MISMATCH
private static final int MATCH
private final java.lang.StringBuilder sb
public static final int DEFAULT_READ_AHEAD
private static final java.util.HashMap<java.lang.String,java.lang.Character> entityTable
public LegacyHTMLStripCharFilter(org.apache.lucene.analysis.CharStream source)
public LegacyHTMLStripCharFilter(org.apache.lucene.analysis.CharStream source,
java.util.Set<java.lang.String> escapedTags)
public LegacyHTMLStripCharFilter(org.apache.lucene.analysis.CharStream source,
java.util.Set<java.lang.String> escapedTags,
int readAheadLimit)
public static void main(java.lang.String[] args)
throws java.io.IOException
Throws: java.io.IOException
public int getReadAheadLimit()
private int next()
throws java.io.IOException
Throws: java.io.IOException
private int nextSkipWS()
throws java.io.IOException
Throws: java.io.IOException
private int peek()
throws java.io.IOException
Throws: java.io.IOException
private void push(int ch)
private boolean isSpace(int ch)
private boolean isHex(int ch)
private boolean isAlpha(int ch)
private boolean isDigit(int ch)
private boolean isIdChar(int ch)
private boolean isFirstIdChar(int ch)
private void saveState()
throws java.io.IOException
Throws: java.io.IOException
private void restoreState()
throws java.io.IOException
Throws: java.io.IOException
private int readNumericEntity()
throws java.io.IOException
Throws: java.io.IOException
private int readEntity()
throws java.io.IOException
Throws: java.io.IOException
private int readBang(boolean inScript)
throws java.io.IOException
Throws: java.io.IOException
private int readComment(boolean inScript)
throws java.io.IOException
Throws: java.io.IOException
private int readTag()
throws java.io.IOException
Throws: java.io.IOException
int findEndTag()
throws java.io.IOException
Throws: java.io.IOException
private int readScriptString()
throws java.io.IOException
Throws: java.io.IOException
private int readName(boolean checkEscaped)
throws java.io.IOException
Throws: java.io.IOException
private int readAttr2()
throws java.io.IOException
Throws: java.io.IOException
private int eatSSI()
throws java.io.IOException
Throws: java.io.IOException
private int readProcessingInstruction()
throws java.io.IOException
Throws: java.io.IOException
public int read()
throws java.io.IOException
Overrides: read in class java.io.Reader
Throws: java.io.IOException
public int read(char[] cbuf,
int off,
int len)
throws java.io.IOException
Overrides: read in class org.apache.lucene.analysis.CharFilter
Throws: java.io.IOException
public void close()
throws java.io.IOException
Specified by: close in interface java.io.Closeable
Specified by: close in interface java.lang.AutoCloseable
Overrides: close in class org.apache.lucene.analysis.CharFilter
Throws: java.io.IOException