about | summary | refs | log | tree | commit | diff | stats
path: root/textproc/clucene
diff options
context:
space:
mode:
author: jkim <jkim@FreeBSD.org> 2013-04-17 02:37:03 +0800
committer: jkim <jkim@FreeBSD.org> 2013-04-17 02:37:03 +0800
commit: ec2d117cef25ff7a0cd174bf1a54c5daeec44031 (patch)
tree: 4262e85bc4ab6c79b0216e8fc84488229d4e64f4 /textproc/clucene
parent: 21cdbdb6903f4fbcb545ccd637d94c622c218f85 (diff)
download: freebsd-ports-gnome-ec2d117cef25ff7a0cd174bf1a54c5daeec44031.tar.gz
freebsd-ports-gnome-ec2d117cef25ff7a0cd174bf1a54c5daeec44031.tar.zst
freebsd-ports-gnome-ec2d117cef25ff7a0cd174bf1a54c5daeec44031.zip
- Implement efficient BitSet::nextSetBit() to reduce diff against upstream.
  http://clucene.git.sourceforge.net/git/gitweb.cgi?p=clucene/clucene;a=commitdiff;h=17e53d7
- Fix a buffer overflow in CJKAnalyzer. Somehow the upstream missed this in
  2.3.3.4 branch.
  http://clucene.svn.sourceforge.net/viewvc/clucene?view=revision&revision=2630
- Fix potential memory leaks in libstemmer. Merged from Snowball changes.
  http://svn.tartarus.org/snowball/trunk/snowball/libstemmer/libstemmer_c.in?r1=409&r2=520&view=patch
- Implement SnowballAnalyzer::reusableTokenStream(). [1]
  Also, this patch fixes memory leaks found by the submitter.

Submitted by: Kishore Ramareddy (kishore at niksun dot com) (initial version) [1]
Feature safe: yes
Diffstat (limited to 'textproc/clucene')
-rw-r--r-- textproc/clucene/Makefile | 2
-rw-r--r-- textproc/clucene/files/patch-src__contribs-lib__CLucene__analysis__cjk__CJKAnalyzer.h | 11
-rw-r--r-- textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__Snowball.cpp | 74
-rw-r--r-- textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__SnowballAnalyzer.h | 19
-rw-r--r-- textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__libstemmer__libstemmer.c | 24
-rw-r--r-- textproc/clucene/files/patch-src__core__CLucene__util__BitSet.cpp | 67
-rw-r--r-- textproc/clucene/files/patch-src__core__CLucene__util__BitSet.h | 10
7 files changed, 206 insertions, 1 deletions
diff --git a/textproc/clucene/Makefile b/textproc/clucene/Makefile
index b04abd3b79a0..6ca77986fc42 100644
--- a/textproc/clucene/Makefile
+++ b/textproc/clucene/Makefile
@@ -3,7 +3,7 @@
PORTNAME= clucene
PORTVERSION= 2.3.3.4
-PORTREVISION= 1
+PORTREVISION= 2
CATEGORIES= textproc
MASTER_SITES= SF/${PORTNAME}/${PORTNAME}-core-unstable/2.3
DISTNAME= ${PORTNAME}-core-${PORTVERSION}
diff --git a/textproc/clucene/files/patch-src__contribs-lib__CLucene__analysis__cjk__CJKAnalyzer.h b/textproc/clucene/files/patch-src__contribs-lib__CLucene__analysis__cjk__CJKAnalyzer.h
new file mode 100644
index 000000000000..8a7d500a3bcd
--- /dev/null
+++ b/textproc/clucene/files/patch-src__contribs-lib__CLucene__analysis__cjk__CJKAnalyzer.h
@@ -0,0 +1,11 @@
+--- src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.h.orig 2011-03-16 20:21:07.000000000 -0400
++++ src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.h 2013-03-29 18:46:22.000000000 -0400
+@@ -39,7 +39,7 @@
+ * character buffer, store the characters which are used to compose <br>
+ * the returned Token
+ */
+- TCHAR buffer[LUCENE_MAX_WORD_LEN];
++ TCHAR buffer[LUCENE_MAX_WORD_LEN+1];
+
+ /**
+ * I/O buffer, used to store the content of the input(one of the <br>
diff --git a/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__Snowball.cpp b/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__Snowball.cpp
new file mode 100644
index 000000000000..45f8937b70de
--- /dev/null
+++ b/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__Snowball.cpp
@@ -0,0 +1,74 @@
+--- src/contribs-lib/CLucene/snowball/Snowball.cpp.orig 2011-03-16 20:21:07.000000000 -0400
++++ src/contribs-lib/CLucene/snowball/Snowball.cpp 2013-04-01 19:14:15.000000000 -0400
+@@ -19,16 +19,31 @@
+
+ CL_NS_DEF2(analysis,snowball)
+
++ class SnowballAnalyzer::SavedStreams : public TokenStream {
++ public:
++ StandardTokenizer* tokenStream;
++ TokenStream* filteredTokenStream;
++
++ SavedStreams():tokenStream(NULL), filteredTokenStream(NULL) {}
++ void close(){}
++ Token* next(Token* token) {return NULL;}
++ };
++
+ /** Builds the named analyzer with no stop words. */
+ SnowballAnalyzer::SnowballAnalyzer(const TCHAR* language) {
+ this->language = STRDUP_TtoT(language);
+ stopSet = NULL;
+ }
+
+- SnowballAnalyzer::~SnowballAnalyzer(){
+- _CLDELETE_CARRAY(language);
+- if ( stopSet != NULL )
+- _CLDELETE(stopSet);
++ SnowballAnalyzer::~SnowballAnalyzer() {
++ SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream());
++ if (streams != NULL) {
++ _CLDELETE(streams->filteredTokenStream);
++ _CLDELETE(streams);
++ }
++ _CLDELETE_CARRAY(language);
++ if (stopSet != NULL)
++ _CLDELETE(stopSet);
+ }
+
+ /** Builds the named analyzer with the given stop words.
+@@ -62,12 +77,29 @@
+ result = _CLNEW SnowballFilter(result, language, true);
+ return result;
+ }
+-
+-
+-
+-
+-
+-
++
++ TokenStream* SnowballAnalyzer::reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) {
++ SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream());
++
++ if (streams == NULL) {
++ streams = _CLNEW SavedStreams();
++ BufferedReader* bufferedReader = reader->__asBufferedReader();
++
++ if (bufferedReader == NULL)
++ streams->tokenStream = _CLNEW StandardTokenizer(_CLNEW FilteredBufferedReader(reader, false), true);
++ else
++ streams->tokenStream = _CLNEW StandardTokenizer(bufferedReader);
++
++ streams->filteredTokenStream = _CLNEW StandardFilter(streams->tokenStream, true);
++ streams->filteredTokenStream = _CLNEW LowerCaseFilter(streams->filteredTokenStream, true);
++ if (stopSet != NULL)
++ streams->filteredTokenStream = _CLNEW StopFilter(streams->filteredTokenStream, true, stopSet);
++ streams->filteredTokenStream = _CLNEW SnowballFilter(streams->filteredTokenStream, language, true);
++ setPreviousTokenStream(streams);
++ } else
++ streams->tokenStream->reset(reader);
++ return streams->filteredTokenStream;
++ }
+
+ /** Construct the named stemming filter.
+ *
diff --git a/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__SnowballAnalyzer.h b/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__SnowballAnalyzer.h
new file mode 100644
index 000000000000..7de13249ac39
--- /dev/null
+++ b/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__SnowballAnalyzer.h
@@ -0,0 +1,19 @@
+--- src/contribs-lib/CLucene/snowball/SnowballAnalyzer.h.orig 2011-03-16 20:21:07.000000000 -0400
++++ src/contribs-lib/CLucene/snowball/SnowballAnalyzer.h 2013-04-01 18:25:10.000000000 -0400
+@@ -22,6 +22,7 @@
+ class CLUCENE_CONTRIBS_EXPORT SnowballAnalyzer: public Analyzer {
+ TCHAR* language;
+ CLTCSetList* stopSet;
++ class SavedStreams;
+
+ public:
+ /** Builds the named analyzer with no stop words. */
+@@ -37,6 +38,8 @@
+ StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
+ TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
+ TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader, bool deleteReader);
++
++ TokenStream* reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
+ };
+
+ CL_NS_END2
diff --git a/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__libstemmer__libstemmer.c b/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__libstemmer__libstemmer.c
new file mode 100644
index 000000000000..0a7342740bf3
--- /dev/null
+++ b/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__libstemmer__libstemmer.c
@@ -0,0 +1,24 @@
+--- src/contribs-lib/CLucene/snowball/libstemmer/libstemmer.c.orig 2011-03-16 20:21:07.000000000 -0400
++++ src/contribs-lib/CLucene/snowball/libstemmer/libstemmer.c 2013-03-29 18:54:39.000000000 -0400
+@@ -35,9 +35,8 @@
+ {
+ stemmer_encoding enc;
+ struct stemmer_modules * module;
+- struct sb_stemmer * stemmer =
+- (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
+- if (stemmer == NULL) return NULL;
++ struct sb_stemmer * stemmer;
++
+ enc = sb_getenc(charenc);
+ if (enc == ENC_UNKNOWN) return NULL;
+
+@@ -46,6 +45,9 @@
+ }
+ if (module->name == NULL) return NULL;
+
++ stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
++ if (stemmer == NULL) return NULL;
++
+ stemmer->create = module->create;
+ stemmer->close = module->close;
+ stemmer->stem = module->stem;
diff --git a/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.cpp b/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.cpp
new file mode 100644
index 000000000000..87c0753cfeae
--- /dev/null
+++ b/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.cpp
@@ -0,0 +1,67 @@
+--- src/core/CLucene/util/BitSet.cpp.orig 2011-03-16 20:21:07.000000000 -0400
++++ src/core/CLucene/util/BitSet.cpp 2013-03-29 17:57:05.000000000 -0400
+@@ -32,6 +32,25 @@
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
+
++const uint8_t BitSet::BYTE_OFFSETS[256] = {
++ 8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++ 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++ 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++ 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
++
++
+ BitSet::BitSet( const BitSet& copy ) :
+ _size( copy._size ),
+ _count(-1)
+@@ -180,19 +199,32 @@
+ return factor * (4 + (8+40)*count()) < size();
+ }
+
+- int32_t BitSet::nextSetBit(int32_t fromIndex) const {
++ int32_t BitSet::nextSetBit(int32_t fromIndex) const
++ {
+ if (fromIndex < 0)
+ _CLTHROWT(CL_ERR_IndexOutOfBounds, _T("fromIndex < 0"));
+
+ if (fromIndex >= _size)
+ return -1;
+
+- while (true) {
+- if ((bits[fromIndex >> 3] & (1 << (fromIndex & 7))) != 0)
+- return fromIndex;
+- if (++fromIndex == _size)
+- return -1;
++ int _max = ( _size+7 ) >> 3;
++
++ unsigned int i = (int)( fromIndex>>3 );
++ unsigned int subIndex = fromIndex & 0x7; // index within the byte
++ uint8_t byte = bits[i] >> subIndex; // skip all the bits to the right of index
++
++ if ( byte != 0 )
++ {
++ return ( ( i<<3 ) + subIndex + BYTE_OFFSETS[ byte ] );
++ }
++
++ while( ++i < _max )
++ {
++ byte = bits[i];
++ if ( byte != 0 )
++ return ( ( i<<3 ) + BYTE_OFFSETS[ byte ] );
+ }
++ return -1;
+ }
+
+ CL_NS_END
diff --git a/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.h b/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.h
new file mode 100644
index 000000000000..d648b4cab3c8
--- /dev/null
+++ b/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.h
@@ -0,0 +1,10 @@
+--- src/core/CLucene/util/BitSet.h.orig 2011-03-16 20:21:07.000000000 -0400
++++ src/core/CLucene/util/BitSet.h 2013-03-29 17:57:05.000000000 -0400
+@@ -39,6 +39,7 @@
+ /** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or dense, and should be saved as a bit set. */
+ bool isSparse();
+ static const uint8_t BYTE_COUNTS[256];
++ static const uint8_t BYTE_OFFSETS[256];
+ protected:
+ BitSet( const BitSet& copy );
+