aboutsummaryrefslogtreecommitdiffstats
path: root/textproc
diff options
context:
space:
mode:
author thierry <thierry@FreeBSD.org> 2006-12-05 05:45:23 +0800
committer thierry <thierry@FreeBSD.org> 2006-12-05 05:45:23 +0800
commit b693421c2fa2bff3834332e3de13e72b497cf119 (patch)
tree ed5c714c1bb8cb59d612d645403c027dd45d13eb /textproc
parent cde15a8ba05ec131b187db705c55885faebf2216 (diff)
download freebsd-ports-gnome-b693421c2fa2bff3834332e3de13e72b497cf119.tar.gz
freebsd-ports-gnome-b693421c2fa2bff3834332e3de13e72b497cf119.tar.zst
freebsd-ports-gnome-b693421c2fa2bff3834332e3de13e72b497cf119.zip
Libtextcat is a library with functions that implement the classification
technique described in Cavnar & Trenkle, "N-Gram-Based Text Categorization". It was primarily developed for language guessing, a task on which it is known to perform with near-perfect accuracy. WWW: http://software.wise-guys.nl/libtextcat/
Diffstat (limited to 'textproc')
-rw-r--r-- textproc/Makefile 1
-rw-r--r-- textproc/libtextcat/Makefile 39
-rw-r--r-- textproc/libtextcat/distinfo 3
-rw-r--r-- textproc/libtextcat/files/patch-src_Makefile.in 11
-rw-r--r-- textproc/libtextcat/pkg-descr 17
-rw-r--r-- textproc/libtextcat/pkg-plist 85
6 files changed, 156 insertions, 0 deletions
diff --git a/textproc/Makefile b/textproc/Makefile
index bd5f2617fb30..47cc6e6a60d3 100644
--- a/textproc/Makefile
+++ b/textproc/Makefile
@@ -248,6 +248,7 @@
SUBDIR += libstree
SUBDIR += libtext-charwidth-perl
SUBDIR += libtext-wrapi18n-perl
+ SUBDIR += libtextcat
SUBDIR += libtranslate
SUBDIR += libtre
SUBDIR += libuninameslist
diff --git a/textproc/libtextcat/Makefile b/textproc/libtextcat/Makefile
new file mode 100644
index 000000000000..9dbb3a0bafad
--- /dev/null
+++ b/textproc/libtextcat/Makefile
@@ -0,0 +1,39 @@
+# New ports collection makefile for: libtextcat
+# Date created: Sat 18 nov 2006
+# Whom: thierry@pompo.net
+#
+# $FreeBSD$
+#
+
+PORTNAME= libtextcat
+PORTVERSION= 2.2
+CATEGORIES= textproc
+MASTER_SITES= http://software.wise-guys.nl/download/
+
+MAINTAINER= thierry@FreeBSD.org
+COMMENT= Language guessing by N-Gram-Based Text Categorization
+# configure gets an explicit --build triplet; pkg-plist lists a shared library.
+GNU_CONFIGURE= yes
+CONFIGURE_TARGET= --build=${ARCH}-portbld-freebsd${OSREL}
+USE_LDCONFIG= yes
+# Documentation files that post-install copies from WRKSRC into DOCSDIR.
+PORTDOCS= LICENSE README TODO
+# Install textcat.h, the *.lm models and conf.txt; docs unless NOPORTDOCS is set.
+post-install:
+	${INSTALL_DATA} ${WRKSRC}/src/textcat.h ${PREFIX}/include/
+	${MKDIR} ${DATADIR}/LM
+	@${ECHO_MSG} "Installing language models provided in Gertjan van Noord's TextCat package"
+	${INSTALL_DATA} ${WRKSRC}/langclass/LM/*.lm ${DATADIR}/LM
+	${INSTALL_DATA} ${WRKSRC}/langclass/conf.txt "${DATADIR}"
+.if !defined(NOPORTDOCS)
+	${MKDIR} ${DOCSDIR} && ${INSTALL_DATA} ${PORTDOCS:S|^|${WRKSRC}/|} ${DOCSDIR}
+.endif
+# Classify every langclass/ShortTexts sample with testtextcat; stop on first failure.
+regression-test: build
+	(cd ${WRKSRC}/langclass/ && \
+	for t in ShortTexts/*.txt ; do \
+		${ECHO_MSG} "Analyzing $$t..." ; \
+		../src/testtextcat conf.txt < "$$t" || exit 1 ; \
+	done)
+
+.include <bsd.port.mk>
diff --git a/textproc/libtextcat/distinfo b/textproc/libtextcat/distinfo
new file mode 100644
index 000000000000..00bb5f2dcbd5
--- /dev/null
+++ b/textproc/libtextcat/distinfo
@@ -0,0 +1,3 @@
+MD5 (libtextcat-2.2.tar.gz) = 128cfc86ed5953e57fe0f5ae98b62c2e
+SHA256 (libtextcat-2.2.tar.gz) = 5677badffc48a8d332e345ea4fe225e3577f53fc95deeec8306000b256829655
+SIZE (libtextcat-2.2.tar.gz) = 540999
diff --git a/textproc/libtextcat/files/patch-src_Makefile.in b/textproc/libtextcat/files/patch-src_Makefile.in
new file mode 100644
index 000000000000..835d7c67e038
--- /dev/null
+++ b/textproc/libtextcat/files/patch-src_Makefile.in
@@ -0,0 +1,11 @@
+--- src/Makefile.in.orig Thu May 22 13:39:52 2003
++++ src/Makefile.in Sat Nov 18 22:55:18 2006
+@@ -126,7 +126,7 @@
+
+ WARNS = -W -Wall -Wshadow -Wpointer-arith
+ IFLAGS =
+-FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE
++FLAGS = -g -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE
+ VERBOSE = -DVERBOSE
+ AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS)
+ AM_LDFLAGS = -g
diff --git a/textproc/libtextcat/pkg-descr b/textproc/libtextcat/pkg-descr
new file mode 100644
index 000000000000..c0a7e7660e16
--- /dev/null
+++ b/textproc/libtextcat/pkg-descr
@@ -0,0 +1,17 @@
+Libtextcat is a library with functions that implement the classification
+technique described in Cavnar & Trenkle, "N-Gram-Based Text Categorization" [1].
+It was primarily developed for language guessing, a task on which it is known to
+perform with near-perfect accuracy.
+
+The central idea of the Cavnar & Trenkle technique is to calculate a
+"fingerprint" of a document with an unknown category, and compare this with the
+fingerprints of a number of documents of which the categories are known. The
+categories of the closest matches are output as the classification. A
+fingerprint is a list of the most frequent n-grams occurring in a document,
+ordered by frequency. Fingerprints are compared with a simple out-of-place
+metric.
+
+[1] The document that started it all: William B. Cavnar & John M. Trenkle (1994)
+N-Gram-Based Text Categorization, <http://citeseer.ist.psu.edu/68861.html>.
+
+WWW: http://software.wise-guys.nl/libtextcat/
diff --git a/textproc/libtextcat/pkg-plist b/textproc/libtextcat/pkg-plist
new file mode 100644
index 000000000000..74a45fa7fd43
--- /dev/null
+++ b/textproc/libtextcat/pkg-plist
@@ -0,0 +1,85 @@
+bin/createfp
+include/textcat.h
+lib/libtextcat.a
+lib/libtextcat.la
+lib/libtextcat.so
+lib/libtextcat.so.0
+%%DATADIR%%/LM/afrikaans.lm
+%%DATADIR%%/LM/albanian.lm
+%%DATADIR%%/LM/amharic-utf.lm
+%%DATADIR%%/LM/arabic-iso8859_6.lm
+%%DATADIR%%/LM/arabic-windows1256.lm
+%%DATADIR%%/LM/armenian.lm
+%%DATADIR%%/LM/basque.lm
+%%DATADIR%%/LM/belarus-windows1251.lm
+%%DATADIR%%/LM/bosnian.lm
+%%DATADIR%%/LM/breton.lm
+%%DATADIR%%/LM/bulgarian-iso8859_5.lm
+%%DATADIR%%/LM/catalan.lm
+%%DATADIR%%/LM/chinese-big5.lm
+%%DATADIR%%/LM/chinese-gb2312.lm
+%%DATADIR%%/LM/croatian-ascii.lm
+%%DATADIR%%/LM/czech-iso8859_2.lm
+%%DATADIR%%/LM/danish.lm
+%%DATADIR%%/LM/drents.lm
+%%DATADIR%%/LM/dutch.lm
+%%DATADIR%%/LM/english.lm
+%%DATADIR%%/LM/esperanto.lm
+%%DATADIR%%/LM/estonian.lm
+%%DATADIR%%/LM/finnish.lm
+%%DATADIR%%/LM/french.lm
+%%DATADIR%%/LM/frisian.lm
+%%DATADIR%%/LM/georgian.lm
+%%DATADIR%%/LM/german.lm
+%%DATADIR%%/LM/greek-iso8859-7.lm
+%%DATADIR%%/LM/hebrew-iso8859_8.lm
+%%DATADIR%%/LM/hindi.lm
+%%DATADIR%%/LM/hungarian.lm
+%%DATADIR%%/LM/icelandic.lm
+%%DATADIR%%/LM/indonesian.lm
+%%DATADIR%%/LM/irish.lm
+%%DATADIR%%/LM/italian.lm
+%%DATADIR%%/LM/japanese-euc_jp.lm
+%%DATADIR%%/LM/japanese-shift_jis.lm
+%%DATADIR%%/LM/korean.lm
+%%DATADIR%%/LM/latin.lm
+%%DATADIR%%/LM/latvian.lm
+%%DATADIR%%/LM/lithuanian.lm
+%%DATADIR%%/LM/malay.lm
+%%DATADIR%%/LM/manx.lm
+%%DATADIR%%/LM/marathi.lm
+%%DATADIR%%/LM/middle_frisian.lm
+%%DATADIR%%/LM/mingo.lm
+%%DATADIR%%/LM/nepali.lm
+%%DATADIR%%/LM/norwegian.lm
+%%DATADIR%%/LM/persian.lm
+%%DATADIR%%/LM/polish.lm
+%%DATADIR%%/LM/portuguese.lm
+%%DATADIR%%/LM/quechua.lm
+%%DATADIR%%/LM/romanian.lm
+%%DATADIR%%/LM/rumantsch.lm
+%%DATADIR%%/LM/russian-iso8859_5.lm
+%%DATADIR%%/LM/russian-koi8_r.lm
+%%DATADIR%%/LM/russian-windows1251.lm
+%%DATADIR%%/LM/sanskrit.lm
+%%DATADIR%%/LM/scots.lm
+%%DATADIR%%/LM/scots_gaelic.lm
+%%DATADIR%%/LM/serbian-ascii.lm
+%%DATADIR%%/LM/slovak-ascii.lm
+%%DATADIR%%/LM/slovak-windows1250.lm
+%%DATADIR%%/LM/slovenian-ascii.lm
+%%DATADIR%%/LM/slovenian-iso8859_2.lm
+%%DATADIR%%/LM/spanish.lm
+%%DATADIR%%/LM/swahili.lm
+%%DATADIR%%/LM/swedish.lm
+%%DATADIR%%/LM/tagalog.lm
+%%DATADIR%%/LM/tamil.lm
+%%DATADIR%%/LM/thai.lm
+%%DATADIR%%/LM/turkish.lm
+%%DATADIR%%/LM/ukrainian-koi8_r.lm
+%%DATADIR%%/LM/vietnamese.lm
+%%DATADIR%%/LM/welsh.lm
+%%DATADIR%%/LM/yiddish-utf.lm
+%%DATADIR%%/conf.txt
+@dirrm %%DATADIR%%/LM
+@dirrm %%DATADIR%%