- Implement efficient BitSet::nextSetBit() to reduce diff against upstream.

http://clucene.git.sourceforge.net/git/gitweb.cgi?p=clucene/clucene;a=commitdiff;h=17e53d7
- Fix a buffer overflow in CJKAnalyzer. Somehow upstream missed this in the 2.3.3.4 branch.
http://clucene.svn.sourceforge.net/viewvc/clucene?view=revision&revision=2630
- Fix potential memory leaks in libstemmer. Merged from Snowball changes.
http://svn.tartarus.org/snowball/trunk/snowball/libstemmer/libstemmer_c.in?r1=409&r2=520&view=patch
- Implement SnowballAnalyzer::reusableTokenStream(). [1]
Also, this patch fixes memory leaks found by the submitter.

Submitted by: Kishore Ramareddy (kishore at niksun dot com) (initial version) [1]
Feature safe: yes
author: jkim <jkim@FreeBSD.org> 2013-04-17 02:37:03 +0800
committer: jkim <jkim@FreeBSD.org> 2013-04-17 02:37:03 +0800
commit: ec2d117cef25ff7a0cd174bf1a54c5daeec44031 (patch)
tree: 4262e85bc4ab6c79b0216e8fc84488229d4e64f4 /textproc/clucene
parent: 21cdbdb6903f4fbcb545ccd637d94c622c218f85 (diff)
download: freebsd-ports-gnome-ec2d117cef25ff7a0cd174bf1a54c5daeec44031.tar.gz
freebsd-ports-gnome-ec2d117cef25ff7a0cd174bf1a54c5daeec44031.tar.zst
freebsd-ports-gnome-ec2d117cef25ff7a0cd174bf1a54c5daeec44031.zip
7 files changed, 206 insertions, 1 deletion
diff --git a/textproc/clucene/Makefile b/textproc/clucene/Makefile
index b04abd3b79a0..6ca77986fc42 100644
--- a/textproc/clucene/Makefile
+++ b/textproc/clucene/Makefile
@@ -3,7 +3,7 @@
 
 PORTNAME=	clucene
 PORTVERSION=	2.3.3.4
-PORTREVISION=	1
+PORTREVISION=	2
 CATEGORIES=	textproc
 MASTER_SITES=	SF/${PORTNAME}/${PORTNAME}-core-unstable/2.3
 DISTNAME=	${PORTNAME}-core-${PORTVERSION}
diff --git a/textproc/clucene/files/patch-src__contribs-lib__CLucene__analysis__cjk__CJKAnalyzer.h b/textproc/clucene/files/patch-src__contribs-lib__CLucene__analysis__cjk__CJKAnalyzer.h
new file mode 100644
index 000000000000..8a7d500a3bcd
--- /dev/null
+++ b/textproc/clucene/files/patch-src__contribs-lib__CLucene__analysis__cjk__CJKAnalyzer.h
@@ -0,0 +1,11 @@
+--- src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.h.orig	2011-03-16 20:21:07.000000000 -0400
++++ src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.h	2013-03-29 18:46:22.000000000 -0400
+@@ -39,7 +39,7 @@
+      * character buffer, store the characters which are used to compose <br>
+      * the returned Token
+      */
+-    TCHAR buffer[LUCENE_MAX_WORD_LEN];
++    TCHAR buffer[LUCENE_MAX_WORD_LEN+1];
+ 
+     /**
+      * I/O buffer, used to store the content of the input(one of the <br>
diff --git a/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__Snowball.cpp b/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__Snowball.cpp
new file mode 100644
index 000000000000..45f8937b70de
--- /dev/null
+++ b/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__Snowball.cpp
@@ -0,0 +1,74 @@
+--- src/contribs-lib/CLucene/snowball/Snowball.cpp.orig	2011-03-16 20:21:07.000000000 -0400
++++ src/contribs-lib/CLucene/snowball/Snowball.cpp	2013-04-01 19:14:15.000000000 -0400
+@@ -19,16 +19,31 @@
+ 
+ CL_NS_DEF2(analysis,snowball)
+ 
++  class SnowballAnalyzer::SavedStreams : public TokenStream {
++  public:
++    StandardTokenizer* tokenStream;
++    TokenStream* filteredTokenStream;
++
++    SavedStreams():tokenStream(NULL), filteredTokenStream(NULL) {}
++    void close(){}
++    Token* next(Token* token) {return NULL;}
++  };
++  
+   /** Builds the named analyzer with no stop words. */
+   SnowballAnalyzer::SnowballAnalyzer(const TCHAR* language) {
+     this->language = STRDUP_TtoT(language);
+ 	stopSet = NULL;
+   }
+ 
+-  SnowballAnalyzer::~SnowballAnalyzer(){
+-	  _CLDELETE_CARRAY(language);
+-	  if ( stopSet != NULL )
+-		  _CLDELETE(stopSet);
++  SnowballAnalyzer::~SnowballAnalyzer() {
++    SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream());
++    if (streams != NULL) {
++      _CLDELETE(streams->filteredTokenStream);
++      _CLDELETE(streams);
++    }
++    _CLDELETE_CARRAY(language);
++    if (stopSet != NULL)
++      _CLDELETE(stopSet);
+   }
+ 
+   /** Builds the named analyzer with the given stop words.
+@@ -62,12 +77,29 @@
+     result = _CLNEW SnowballFilter(result, language, true);
+     return result;
+   }
+-  
+-  
+-  
+-  
+-  
+-  
++
++  TokenStream* SnowballAnalyzer::reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) {
++    SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream());
++
++    if (streams == NULL) {
++      streams = _CLNEW SavedStreams();
++      BufferedReader* bufferedReader = reader->__asBufferedReader();
++
++      if (bufferedReader == NULL)
++        streams->tokenStream = _CLNEW StandardTokenizer(_CLNEW FilteredBufferedReader(reader, false), true);
++      else
++        streams->tokenStream = _CLNEW StandardTokenizer(bufferedReader);
++
++      streams->filteredTokenStream = _CLNEW StandardFilter(streams->tokenStream, true);
++      streams->filteredTokenStream = _CLNEW LowerCaseFilter(streams->filteredTokenStream, true);
++      if (stopSet != NULL)
++        streams->filteredTokenStream = _CLNEW StopFilter(streams->filteredTokenStream, true, stopSet);
++      streams->filteredTokenStream = _CLNEW SnowballFilter(streams->filteredTokenStream, language, true);
++      setPreviousTokenStream(streams);
++    } else
++      streams->tokenStream->reset(reader);
++    return streams->filteredTokenStream;
++  }
+   
+     /** Construct the named stemming filter.
+    *
diff --git a/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__SnowballAnalyzer.h b/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__SnowballAnalyzer.h
new file mode 100644
index 000000000000..7de13249ac39
--- /dev/null
+++ b/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__SnowballAnalyzer.h
@@ -0,0 +1,19 @@
+--- src/contribs-lib/CLucene/snowball/SnowballAnalyzer.h.orig	2011-03-16 20:21:07.000000000 -0400
++++ src/contribs-lib/CLucene/snowball/SnowballAnalyzer.h	2013-04-01 18:25:10.000000000 -0400
+@@ -22,6 +22,7 @@
+ class CLUCENE_CONTRIBS_EXPORT SnowballAnalyzer: public Analyzer {
+   TCHAR* language;
+   CLTCSetList* stopSet;
++  class SavedStreams;
+ 
+ public:
+   /** Builds the named analyzer with no stop words. */
+@@ -37,6 +38,8 @@
+       StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
+   TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
+   TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader, bool deleteReader);
++
++  TokenStream* reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
+ };
+ 
+ CL_NS_END2
diff --git a/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__libstemmer__libstemmer.c b/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__libstemmer__libstemmer.c
new file mode 100644
index 000000000000..0a7342740bf3
--- /dev/null
+++ b/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__libstemmer__libstemmer.c
@@ -0,0 +1,24 @@
+--- src/contribs-lib/CLucene/snowball/libstemmer/libstemmer.c.orig	2011-03-16 20:21:07.000000000 -0400
++++ src/contribs-lib/CLucene/snowball/libstemmer/libstemmer.c	2013-03-29 18:54:39.000000000 -0400
+@@ -35,9 +35,8 @@
+ {
+     stemmer_encoding enc;
+     struct stemmer_modules * module;
+-    struct sb_stemmer * stemmer =
+-	    (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
+-    if (stemmer == NULL) return NULL;
++    struct sb_stemmer * stemmer;
++
+     enc = sb_getenc(charenc);
+     if (enc == ENC_UNKNOWN) return NULL;
+ 
+@@ -46,6 +45,9 @@
+     }
+     if (module->name == NULL) return NULL;
+     
++    stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
++    if (stemmer == NULL) return NULL;
++
+     stemmer->create = module->create;
+     stemmer->close = module->close;
+     stemmer->stem = module->stem;
diff --git a/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.cpp b/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.cpp
new file mode 100644
index 000000000000..87c0753cfeae
--- /dev/null
+++ b/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.cpp
@@ -0,0 +1,67 @@
+--- src/core/CLucene/util/BitSet.cpp.orig	2011-03-16 20:21:07.000000000 -0400
++++ src/core/CLucene/util/BitSet.cpp	2013-03-29 17:57:05.000000000 -0400
+@@ -32,6 +32,25 @@
+     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
+ 
++const uint8_t BitSet::BYTE_OFFSETS[256] = {
++    8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++    5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++    6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
++
++
+ BitSet::BitSet( const BitSet& copy ) :
+ 	_size( copy._size ),
+ 	_count(-1)
+@@ -180,19 +199,32 @@
+     return                            factor * (4 + (8+40)*count()) < size();
+   }
+ 
+-  int32_t BitSet::nextSetBit(int32_t fromIndex) const {
++  int32_t BitSet::nextSetBit(int32_t fromIndex) const 
++  {
+       if (fromIndex < 0)
+           _CLTHROWT(CL_ERR_IndexOutOfBounds, _T("fromIndex < 0"));
+ 
+       if (fromIndex >= _size)
+           return -1;
+ 
+-      while (true) {
+-          if ((bits[fromIndex >> 3] & (1 << (fromIndex & 7))) != 0)
+-              return fromIndex;
+-          if (++fromIndex == _size)
+-              return -1;
++      int _max = ( _size+7 ) >> 3;
++
++      unsigned int i = (int)( fromIndex>>3 );
++      unsigned int subIndex = fromIndex & 0x7; // index within the byte
++      uint8_t byte = bits[i] >> subIndex;  // skip all the bits to the right of index
++
++      if ( byte != 0 ) 
++      {
++          return ( ( i<<3 ) + subIndex + BYTE_OFFSETS[ byte ] );
++      }
++
++      while( ++i < _max ) 
++      {
++          byte = bits[i];
++          if ( byte != 0 ) 
++              return ( ( i<<3 ) + BYTE_OFFSETS[ byte ] );
+       }
++      return -1;
+   }
+ 
+ CL_NS_END
diff --git a/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.h b/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.h
new file mode 100644
index 000000000000..d648b4cab3c8
--- /dev/null
+++ b/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.h
@@ -0,0 +1,10 @@
+--- src/core/CLucene/util/BitSet.h.orig	2011-03-16 20:21:07.000000000 -0400
++++ src/core/CLucene/util/BitSet.h	2013-03-29 17:57:05.000000000 -0400
+@@ -39,6 +39,7 @@
+   /** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or dense, and should be saved as a bit set. */
+   bool isSparse();
+   static const uint8_t BYTE_COUNTS[256];
++  static const uint8_t BYTE_OFFSETS[256];
+ protected:
+ 	BitSet( const BitSet& copy );
+
author	jkim <jkim@FreeBSD.org>	2013-04-17 02:37:03 +0800
committer	jkim <jkim@FreeBSD.org>	2013-04-17 02:37:03 +0800
commit	ec2d117cef25ff7a0cd174bf1a54c5daeec44031 (patch)
tree	4262e85bc4ab6c79b0216e8fc84488229d4e64f4 /textproc/clucene
parent	21cdbdb6903f4fbcb545ccd637d94c622c218f85 (diff)
download	freebsd-ports-gnome-ec2d117cef25ff7a0cd174bf1a54c5daeec44031.tar.gz freebsd-ports-gnome-ec2d117cef25ff7a0cd174bf1a54c5daeec44031.tar.zst freebsd-ports-gnome-ec2d117cef25ff7a0cd174bf1a54c5daeec44031.zip