@Deprecated
public class LanguageProfilerBuilder
extends java.lang.Object
| Modifier and Type | Class and Description |
|---|---|
(package private) static class |
LanguageProfilerBuilder.NGramEntry
Deprecated.
Inner class that describes a NGram
|
private static class |
LanguageProfilerBuilder.QuickStringBuffer
Deprecated.
|
| Modifier and Type | Field and Description |
|---|---|
(package private) static int |
ABSOLUTE_MAX_NGRAM_LENGTH
Deprecated.
The maximum length allowed for a ngram.
|
(package private) static int |
ABSOLUTE_MIN_NGRAM_LENGTH
Deprecated.
The minimum length allowed for a ngram.
|
(package private) static int |
DEFAULT_MAX_NGRAM_LENGTH
Deprecated.
The default max length of ngram
|
(package private) static int |
DEFAULT_MIN_NGRAM_LENGTH
Deprecated.
The default min length of ngram
|
(package private) static java.lang.String |
FILE_EXTENSION
Deprecated.
The ngram profile file extension
|
(package private) static int |
MAX_SIZE
Deprecated.
The profile max size (number of ngrams of the same size)
|
private int |
maxLength
Deprecated.
The max length of ngram
|
private int |
minLength
Deprecated.
The min length of ngram
|
private java.lang.String |
name
Deprecated.
The profile's name
|
private int[] |
ngramcounts
Deprecated.
The total number of ngram occurrences
|
private java.util.Map<java.lang.CharSequence,LanguageProfilerBuilder.NGramEntry> |
ngrams
Deprecated.
An index of the ngrams of the profile
|
private static java.lang.String |
SEP_CHARSEQ
Deprecated.
The String form of the separator char
|
(package private) static char |
SEPARATOR
Deprecated.
separator char
|
private java.util.List<LanguageProfilerBuilder.NGramEntry> |
sorted
Deprecated.
The NGrams of this profile sorted on the number of occurrences
|
private LanguageProfilerBuilder.QuickStringBuffer |
word
Deprecated.
A StringBuffer used during analysis
|
| Constructor and Description |
|---|
LanguageProfilerBuilder(java.lang.String name)
Deprecated.
Constructs a new ngram profile where minlen=3, maxlen=3
|
LanguageProfilerBuilder(java.lang.String name,
int minlen,
int maxlen)
Deprecated.
Constructs a new ngram profile
|
| Modifier and Type | Method and Description |
|---|---|
private void |
add(java.lang.CharSequence cs)
Deprecated.
Adds ngrams from a single word in this profile
|
private void |
add(LanguageProfilerBuilder.QuickStringBuffer word)
Deprecated.
Adds the last NGrams from the specified word.
|
void |
add(java.lang.StringBuffer word)
Deprecated.
Adds ngrams from a single word to this profile
|
private void |
add(java.lang.StringBuffer word,
int n)
Deprecated.
|
void |
analyze(java.lang.StringBuilder text)
Deprecated.
Analyzes a piece of text
|
static LanguageProfilerBuilder |
create(java.lang.String name,
java.io.InputStream is,
java.lang.String encoding)
Deprecated.
Creates a new language profile from a text file (preferably quite large:
5-10k lines)
|
java.lang.String |
getName()
Deprecated.
|
float |
getSimilarity(LanguageProfilerBuilder another)
Deprecated.
Calculates a score for how well NGramProfiles match each other
|
java.util.List<LanguageProfilerBuilder.NGramEntry> |
getSorted()
Deprecated.
Returns a sorted list of ngrams (sorted by frequency, then by sequence).
|
void |
load(java.io.InputStream is)
Deprecated.
Loads a ngram profile from an InputStream (assumes UTF-8 encoded content)
|
static void |
main(java.lang.String[] args)
Deprecated.
main method used for testing only
|
protected void |
normalize()
Deprecated.
Normalizes the profile (calculates the ngrams frequencies)
|
void |
save(java.io.OutputStream os)
Deprecated.
Writes NGramProfile content into OutputStream, content is outputted with
UTF-8 encoding
|
java.lang.String |
toString()
Deprecated.
|
static final int ABSOLUTE_MIN_NGRAM_LENGTH
static final int ABSOLUTE_MAX_NGRAM_LENGTH
static final int DEFAULT_MIN_NGRAM_LENGTH
static final int DEFAULT_MAX_NGRAM_LENGTH
static final java.lang.String FILE_EXTENSION
static final int MAX_SIZE
static final char SEPARATOR
private static final java.lang.String SEP_CHARSEQ
private java.lang.String name
private java.util.List<LanguageProfilerBuilder.NGramEntry> sorted
private int minLength
private int maxLength
private int[] ngramcounts
private java.util.Map<java.lang.CharSequence,LanguageProfilerBuilder.NGramEntry> ngrams
private LanguageProfilerBuilder.QuickStringBuffer word
public LanguageProfilerBuilder(java.lang.String name,
int minlen,
int maxlen)
name - is the name of the profile
minlen - is the min length of ngram sequences
maxlen - is the max length of ngram sequences
public LanguageProfilerBuilder(java.lang.String name)
name - is the name of the profile, usually a two-character string
public java.lang.String getName()
public void add(java.lang.StringBuffer word)
word - is the word to add
private void add(LanguageProfilerBuilder.QuickStringBuffer word)
private void add(java.lang.CharSequence cs)
cs - char sequence to add
public void analyze(java.lang.StringBuilder text)
text - the text to be analyzed
private void add(java.lang.StringBuffer word,
int n)
word -
n - sequence length
protected void normalize()
public java.util.List<LanguageProfilerBuilder.NGramEntry> getSorted()
public java.lang.String toString()
toString in class java.lang.Object
public float getSimilarity(LanguageProfilerBuilder another) throws TikaException
another - ngram profile to compare against
TikaException - if a score could not be calculated
public void load(java.io.InputStream is)
throws java.io.IOException
is - the InputStream to read
java.io.IOException
public static LanguageProfilerBuilder create(java.lang.String name, java.io.InputStream is, java.lang.String encoding) throws TikaException
name - the name to be given to the profile
is - a stream to be read
encoding - is the encoding of the stream
TikaException - if a language profile could not be created
public void save(java.io.OutputStream os)
throws java.io.IOException
os - the stream to output to
java.io.IOException
public static void main(java.lang.String[] args)
args -