diff options
author | jkim <jkim@FreeBSD.org> | 2013-04-17 02:37:03 +0800 |
---|---|---|
committer | jkim <jkim@FreeBSD.org> | 2013-04-17 02:37:03 +0800 |
commit | ec2d117cef25ff7a0cd174bf1a54c5daeec44031 (patch) | |
tree | 4262e85bc4ab6c79b0216e8fc84488229d4e64f4 /textproc/clucene | |
parent | 21cdbdb6903f4fbcb545ccd637d94c622c218f85 (diff) | |
download | freebsd-ports-gnome-ec2d117cef25ff7a0cd174bf1a54c5daeec44031.tar.gz freebsd-ports-gnome-ec2d117cef25ff7a0cd174bf1a54c5daeec44031.tar.zst freebsd-ports-gnome-ec2d117cef25ff7a0cd174bf1a54c5daeec44031.zip |
- Implement efficient BitSet::nextSetBit() to reduce diff against upstream.
http://clucene.git.sourceforge.net/git/gitweb.cgi?p=clucene/clucene;a=commitdiff;h=17e53d7
- Fix a buffer overflow in CJKAnalyzer. Somehow upstream missed this
in the 2.3.3.4 branch.
http://clucene.svn.sourceforge.net/viewvc/clucene?view=revision&revision=2630
- Fix potential memory leaks in libstemmer. Merged from Snowball changes.
http://svn.tartarus.org/snowball/trunk/snowball/libstemmer/libstemmer_c.in?r1=409&r2=520&view=patch
- Implement SnowballAnalyzer::reusableTokenStream(). [1] Also, this patch
fixes memory leaks found by the submitter.
Submitted by: Kishore Ramareddy (kishore at niksun dot com)
(initial version) [1]
Feature safe: yes
Diffstat (limited to 'textproc/clucene')
7 files changed, 206 insertions, 1 deletion
diff --git a/textproc/clucene/Makefile b/textproc/clucene/Makefile index b04abd3b79a0..6ca77986fc42 100644 --- a/textproc/clucene/Makefile +++ b/textproc/clucene/Makefile @@ -3,7 +3,7 @@ PORTNAME= clucene PORTVERSION= 2.3.3.4 -PORTREVISION= 1 +PORTREVISION= 2 CATEGORIES= textproc MASTER_SITES= SF/${PORTNAME}/${PORTNAME}-core-unstable/2.3 DISTNAME= ${PORTNAME}-core-${PORTVERSION} diff --git a/textproc/clucene/files/patch-src__contribs-lib__CLucene__analysis__cjk__CJKAnalyzer.h b/textproc/clucene/files/patch-src__contribs-lib__CLucene__analysis__cjk__CJKAnalyzer.h new file mode 100644 index 000000000000..8a7d500a3bcd --- /dev/null +++ b/textproc/clucene/files/patch-src__contribs-lib__CLucene__analysis__cjk__CJKAnalyzer.h @@ -0,0 +1,11 @@ +--- src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.h.orig 2011-03-16 20:21:07.000000000 -0400 ++++ src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.h 2013-03-29 18:46:22.000000000 -0400 +@@ -39,7 +39,7 @@ + * character buffer, store the characters which are used to compose <br> + * the returned Token + */ +- TCHAR buffer[LUCENE_MAX_WORD_LEN]; ++ TCHAR buffer[LUCENE_MAX_WORD_LEN+1]; + + /** + * I/O buffer, used to store the content of the input(one of the <br> diff --git a/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__Snowball.cpp b/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__Snowball.cpp new file mode 100644 index 000000000000..45f8937b70de --- /dev/null +++ b/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__Snowball.cpp @@ -0,0 +1,74 @@ +--- src/contribs-lib/CLucene/snowball/Snowball.cpp.orig 2011-03-16 20:21:07.000000000 -0400 ++++ src/contribs-lib/CLucene/snowball/Snowball.cpp 2013-04-01 19:14:15.000000000 -0400 +@@ -19,16 +19,31 @@ + + CL_NS_DEF2(analysis,snowball) + ++ class SnowballAnalyzer::SavedStreams : public TokenStream { ++ public: ++ StandardTokenizer* tokenStream; ++ TokenStream* filteredTokenStream; ++ ++ SavedStreams():tokenStream(NULL), 
filteredTokenStream(NULL) {} ++ void close(){} ++ Token* next(Token* token) {return NULL;} ++ }; ++ + /** Builds the named analyzer with no stop words. */ + SnowballAnalyzer::SnowballAnalyzer(const TCHAR* language) { + this->language = STRDUP_TtoT(language); + stopSet = NULL; + } + +- SnowballAnalyzer::~SnowballAnalyzer(){ +- _CLDELETE_CARRAY(language); +- if ( stopSet != NULL ) +- _CLDELETE(stopSet); ++ SnowballAnalyzer::~SnowballAnalyzer() { ++ SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream()); ++ if (streams != NULL) { ++ _CLDELETE(streams->filteredTokenStream); ++ _CLDELETE(streams); ++ } ++ _CLDELETE_CARRAY(language); ++ if (stopSet != NULL) ++ _CLDELETE(stopSet); + } + + /** Builds the named analyzer with the given stop words. +@@ -62,12 +77,29 @@ + result = _CLNEW SnowballFilter(result, language, true); + return result; + } +- +- +- +- +- +- ++ ++ TokenStream* SnowballAnalyzer::reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) { ++ SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream()); ++ ++ if (streams == NULL) { ++ streams = _CLNEW SavedStreams(); ++ BufferedReader* bufferedReader = reader->__asBufferedReader(); ++ ++ if (bufferedReader == NULL) ++ streams->tokenStream = _CLNEW StandardTokenizer(_CLNEW FilteredBufferedReader(reader, false), true); ++ else ++ streams->tokenStream = _CLNEW StandardTokenizer(bufferedReader); ++ ++ streams->filteredTokenStream = _CLNEW StandardFilter(streams->tokenStream, true); ++ streams->filteredTokenStream = _CLNEW LowerCaseFilter(streams->filteredTokenStream, true); ++ if (stopSet != NULL) ++ streams->filteredTokenStream = _CLNEW StopFilter(streams->filteredTokenStream, true, stopSet); ++ streams->filteredTokenStream = _CLNEW SnowballFilter(streams->filteredTokenStream, language, true); ++ setPreviousTokenStream(streams); ++ } else ++ streams->tokenStream->reset(reader); ++ return streams->filteredTokenStream; ++ } + + /** Construct the 
named stemming filter. + * diff --git a/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__SnowballAnalyzer.h b/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__SnowballAnalyzer.h new file mode 100644 index 000000000000..7de13249ac39 --- /dev/null +++ b/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__SnowballAnalyzer.h @@ -0,0 +1,19 @@ +--- src/contribs-lib/CLucene/snowball/SnowballAnalyzer.h.orig 2011-03-16 20:21:07.000000000 -0400 ++++ src/contribs-lib/CLucene/snowball/SnowballAnalyzer.h 2013-04-01 18:25:10.000000000 -0400 +@@ -22,6 +22,7 @@ + class CLUCENE_CONTRIBS_EXPORT SnowballAnalyzer: public Analyzer { + TCHAR* language; + CLTCSetList* stopSet; ++ class SavedStreams; + + public: + /** Builds the named analyzer with no stop words. */ +@@ -37,6 +38,8 @@ + StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */ + TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); + TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader, bool deleteReader); ++ ++ TokenStream* reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); + }; + + CL_NS_END2 diff --git a/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__libstemmer__libstemmer.c b/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__libstemmer__libstemmer.c new file mode 100644 index 000000000000..0a7342740bf3 --- /dev/null +++ b/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__libstemmer__libstemmer.c @@ -0,0 +1,24 @@ +--- src/contribs-lib/CLucene/snowball/libstemmer/libstemmer.c.orig 2011-03-16 20:21:07.000000000 -0400 ++++ src/contribs-lib/CLucene/snowball/libstemmer/libstemmer.c 2013-03-29 18:54:39.000000000 -0400 +@@ -35,9 +35,8 @@ + { + stemmer_encoding enc; + struct stemmer_modules * module; +- struct sb_stemmer * stemmer = +- (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer)); +- if (stemmer == NULL) return NULL; ++ struct 
sb_stemmer * stemmer; ++ + enc = sb_getenc(charenc); + if (enc == ENC_UNKNOWN) return NULL; + +@@ -46,6 +45,9 @@ + } + if (module->name == NULL) return NULL; + ++ stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer)); ++ if (stemmer == NULL) return NULL; ++ + stemmer->create = module->create; + stemmer->close = module->close; + stemmer->stem = module->stem; diff --git a/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.cpp b/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.cpp new file mode 100644 index 000000000000..87c0753cfeae --- /dev/null +++ b/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.cpp @@ -0,0 +1,67 @@ +--- src/core/CLucene/util/BitSet.cpp.orig 2011-03-16 20:21:07.000000000 -0400 ++++ src/core/CLucene/util/BitSet.cpp 2013-03-29 17:57:05.000000000 -0400 +@@ -32,6 +32,25 @@ + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; + ++const uint8_t BitSet::BYTE_OFFSETS[256] = { ++ 8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, ++ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, ++ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, ++ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, ++ 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, ++ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, ++ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, ++ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, ++ 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, ++ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, ++ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, ++ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, ++ 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, ++ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, ++ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, ++ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0}; ++ ++ + BitSet::BitSet( const BitSet& copy ) : + _size( copy._size ), + _count(-1) +@@ -180,19 +199,32 @@ + return factor * (4 + (8+40)*count()) < size(); + } + +- int32_t 
BitSet::nextSetBit(int32_t fromIndex) const { ++ int32_t BitSet::nextSetBit(int32_t fromIndex) const ++ { + if (fromIndex < 0) + _CLTHROWT(CL_ERR_IndexOutOfBounds, _T("fromIndex < 0")); + + if (fromIndex >= _size) + return -1; + +- while (true) { +- if ((bits[fromIndex >> 3] & (1 << (fromIndex & 7))) != 0) +- return fromIndex; +- if (++fromIndex == _size) +- return -1; ++ int _max = ( _size+7 ) >> 3; ++ ++ unsigned int i = (int)( fromIndex>>3 ); ++ unsigned int subIndex = fromIndex & 0x7; // index within the byte ++ uint8_t byte = bits[i] >> subIndex; // skip all the bits to the right of index ++ ++ if ( byte != 0 ) ++ { ++ return ( ( i<<3 ) + subIndex + BYTE_OFFSETS[ byte ] ); ++ } ++ ++ while( ++i < _max ) ++ { ++ byte = bits[i]; ++ if ( byte != 0 ) ++ return ( ( i<<3 ) + BYTE_OFFSETS[ byte ] ); + } ++ return -1; + } + + CL_NS_END diff --git a/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.h b/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.h new file mode 100644 index 000000000000..d648b4cab3c8 --- /dev/null +++ b/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.h @@ -0,0 +1,10 @@ +--- src/core/CLucene/util/BitSet.h.orig 2011-03-16 20:21:07.000000000 -0400 ++++ src/core/CLucene/util/BitSet.h 2013-03-29 17:57:05.000000000 -0400 +@@ -39,6 +39,7 @@ + /** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or dense, and should be saved as a bit set. */ + bool isSparse(); + static const uint8_t BYTE_COUNTS[256]; ++ static const uint8_t BYTE_OFFSETS[256]; + protected: + BitSet( const BitSet& copy ); + |