aboutsummaryrefslogtreecommitdiffstats
path: root/textproc
diff options
context:
space:
mode:
author yuri <yuri@FreeBSD.org> 2018-12-08 12:26:45 +0800
committer yuri <yuri@FreeBSD.org> 2018-12-08 12:26:45 +0800
commit 59743feb6549e4d895aa48424fd36d09a8317e68 (patch)
tree 78b5dbff33f4469bde3557a0674fa8f5f3e16b82 /textproc
parent 076802b3eb932586afbc4c072e13861cbbad079a (diff)
download freebsd-ports-gnome-59743feb6549e4d895aa48424fd36d09a8317e68.tar.gz
freebsd-ports-gnome-59743feb6549e4d895aa48424fd36d09a8317e68.tar.zst
freebsd-ports-gnome-59743feb6549e4d895aa48424fd36d09a8317e68.zip
New ports: textproc/ucto, textproc/uctodata: Advanced rule-based (regular-expression) and unicode-aware tokenizer and its data port
Diffstat (limited to 'textproc')
-rw-r--r-- textproc/Makefile 2
-rw-r--r-- textproc/ucto/Makefile 33
-rw-r--r-- textproc/ucto/distinfo 3
-rw-r--r-- textproc/ucto/files/patch-config_Makefile.am 12
-rw-r--r-- textproc/ucto/pkg-descr 10
-rw-r--r-- textproc/ucto/pkg-plist 12
-rw-r--r-- textproc/uctodata/Makefile 23
-rw-r--r-- textproc/uctodata/distinfo 3
-rw-r--r-- textproc/uctodata/pkg-descr 4
-rw-r--r-- textproc/uctodata/pkg-plist 33
10 files changed, 135 insertions, 0 deletions
diff --git a/textproc/Makefile b/textproc/Makefile
index cb8d7b07a1c7..30e0f160a250 100644
--- a/textproc/Makefile
+++ b/textproc/Makefile
@@ -1789,6 +1789,8 @@
SUBDIR += txt2man
SUBDIR += txt2tags
SUBDIR += uchardet
+ SUBDIR += ucto
+ SUBDIR += uctodata
SUBDIR += uim
SUBDIR += uim-el
SUBDIR += uim-gtk
diff --git a/textproc/ucto/Makefile b/textproc/ucto/Makefile
new file mode 100644
index 000000000000..13793707aeb6
--- /dev/null
+++ b/textproc/ucto/Makefile
@@ -0,0 +1,33 @@
+# $FreeBSD$
+
+PORTNAME= ucto
+DISTVERSIONPREFIX= v
+DISTVERSION= 0.14
+CATEGORIES= textproc
+
+MAINTAINER= yuri@FreeBSD.org
+COMMENT= Advanced rule-based (regular-expression) and unicode-aware tokenizer
+
+LICENSE= APACHE20
+LICENSE_FILE= ${WRKSRC}/COPYING
+
+BUILD_DEPENDS= autoconf-archive>0:devel/autoconf-archive \
+ uctodata>0:textproc/uctodata
+LIB_DEPENDS= libexttextcat-2.0.so:textproc/libexttextcat \
+ libfolia.so:textproc/libfolia \
+ libicuio.so:devel/icu \
+ libomp.so:devel/openmp \
+ libticcutils.so:devel/ticcutils
+RUN_DEPENDS= uctodata>0:textproc/uctodata
+
+USES= autoreconf gmake gnome libedit libtool pkgconfig readline
+GNU_CONFIGURE= yes
+CONFIGURE_ARGS= --disable-static
+USE_GITHUB= yes
+GH_ACCOUNT= LanguageMachines
+USE_GNOME= libxml2
+USE_LDCONFIG= yes
+
+INSTALL_TARGET= install-strip
+
+.include <bsd.port.mk>
diff --git a/textproc/ucto/distinfo b/textproc/ucto/distinfo
new file mode 100644
index 000000000000..0123227cea86
--- /dev/null
+++ b/textproc/ucto/distinfo
@@ -0,0 +1,3 @@
+TIMESTAMP = 1544204678
+SHA256 (LanguageMachines-ucto-v0.14_GH0.tar.gz) = ba40c28b0baba4eef98f88abc7c894a4b6fbaf153eaacd2ea3c9c177b0e85ea5
+SIZE (LanguageMachines-ucto-v0.14_GH0.tar.gz) = 350837
diff --git a/textproc/ucto/files/patch-config_Makefile.am b/textproc/ucto/files/patch-config_Makefile.am
new file mode 100644
index 000000000000..d27cdd1baefc
--- /dev/null
+++ b/textproc/ucto/files/patch-config_Makefile.am
@@ -0,0 +1,12 @@
+--- config/Makefile.am.orig 2018-12-08 03:11:07 UTC
++++ config/Makefile.am
+@@ -7,7 +7,7 @@ EXTRA_DIST = $(config_DATA)
+ install-data-hook:
+ rm -f $(configdir)/textcat.cfg
+ if OLD_LM
+- $(LN_S) $(configdir)/textcat_alt.cfg $(configdir)/textcat.cfg
++ cd $(DESTDIR)$(configdir) && $(LN_S) textcat_alt.cfg textcat.cfg
+ else
+- $(LN_S) $(configdir)/textcat_normal.cfg $(configdir)/textcat.cfg
++ cd $(DESTDIR)$(configdir) && $(LN_S) textcat_normal.cfg textcat.cfg
+ endif
diff --git a/textproc/ucto/pkg-descr b/textproc/ucto/pkg-descr
new file mode 100644
index 000000000000..c731cb067058
--- /dev/null
+++ b/textproc/ucto/pkg-descr
@@ -0,0 +1,10 @@
+Ucto tokenizes text files: it separates words from punctuation and splits
+sentences. It also offers several other basic preprocessing steps, such as
+changing case, all of which you can use to make your text suitable for further
+processing such as indexing, part-of-speech tagging, or machine translation.
+
+Ucto comes with tokenisation rules for several languages and can be easily
+extended to suit other languages. It has been incorporated for tokenizing Dutch
+text in Frog, our Dutch morpho-syntactic processor.
+
+WWW: https://languagemachines.github.io/ucto/
diff --git a/textproc/ucto/pkg-plist b/textproc/ucto/pkg-plist
new file mode 100644
index 000000000000..8364aed5662d
--- /dev/null
+++ b/textproc/ucto/pkg-plist
@@ -0,0 +1,12 @@
+bin/ucto
+include/ucto/my_textcat.h
+include/ucto/setting.h
+include/ucto/tokenize.h
+lib/libucto.so
+lib/libucto.so.3
+lib/libucto.so.3.0.0
+libdata/pkgconfig/ucto.pc
+man/man1/ucto.1.gz
+%%DATADIR%%/textcat.cfg
+%%DATADIR%%/textcat_alt.cfg
+%%DATADIR%%/textcat_normal.cfg
diff --git a/textproc/uctodata/Makefile b/textproc/uctodata/Makefile
new file mode 100644
index 000000000000..d93728a13f84
--- /dev/null
+++ b/textproc/uctodata/Makefile
@@ -0,0 +1,23 @@
+# $FreeBSD$
+
+PORTNAME= uctodata
+DISTVERSIONPREFIX= v
+DISTVERSION= 0.8
+CATEGORIES= textproc
+
+MAINTAINER= yuri@FreeBSD.org
+COMMENT= Datafiles for the tokenizer 'ucto'
+
+LICENSE= APACHE20
+LICENSE_FILE= ${WRKSRC}/COPYING
+
+USES= autoreconf gmake
+GNU_CONFIGURE= yes
+USE_GITHUB= yes
+GH_ACCOUNT= LanguageMachines
+
+NO_ARCH= yes
+
+DATADIR= ${PREFIX}/share/ucto
+
+.include <bsd.port.mk>
diff --git a/textproc/uctodata/distinfo b/textproc/uctodata/distinfo
new file mode 100644
index 000000000000..14ac076b6795
--- /dev/null
+++ b/textproc/uctodata/distinfo
@@ -0,0 +1,3 @@
+TIMESTAMP = 1544225721
+SHA256 (LanguageMachines-uctodata-v0.8_GH0.tar.gz) = a8e5e69696facbd2c2251406560762cf7f4817620179e4a8ee8d241cf0371a5e
+SIZE (LanguageMachines-uctodata-v0.8_GH0.tar.gz) = 37527
diff --git a/textproc/uctodata/pkg-descr b/textproc/uctodata/pkg-descr
new file mode 100644
index 000000000000..aab02567a2ee
--- /dev/null
+++ b/textproc/uctodata/pkg-descr
@@ -0,0 +1,4 @@
+Datafiles for ucto, the rule-based tokenization package that is used to
+parse texts in different languages.
+
+WWW: https://languagemachines.github.io/ucto/
diff --git a/textproc/uctodata/pkg-plist b/textproc/uctodata/pkg-plist
new file mode 100644
index 000000000000..8f93d4a2e507
--- /dev/null
+++ b/textproc/uctodata/pkg-plist
@@ -0,0 +1,33 @@
+libdata/pkgconfig/uctodata.pc
+%%DATADIR%%/e-mail.rule
+%%DATADIR%%/exotic-eos.eos
+%%DATADIR%%/exotic-quotes.quote
+%%DATADIR%%/fra.abr
+%%DATADIR%%/fry.abr
+%%DATADIR%%/ligatures.filter
+%%DATADIR%%/nld_afk.abr
+%%DATADIR%%/por.abr
+%%DATADIR%%/rus.abr
+%%DATADIR%%/smiley.rule
+%%DATADIR%%/spa.abr
+%%DATADIR%%/standard-eos.eos
+%%DATADIR%%/standard-quotes.quote
+%%DATADIR%%/swe.abr
+%%DATADIR%%/tokconfig-deu
+%%DATADIR%%/tokconfig-eng
+%%DATADIR%%/tokconfig-fra
+%%DATADIR%%/tokconfig-fry
+%%DATADIR%%/tokconfig-generic
+%%DATADIR%%/tokconfig-ita
+%%DATADIR%%/tokconfig-nld
+%%DATADIR%%/tokconfig-nld-historical
+%%DATADIR%%/tokconfig-nld-sonarchat
+%%DATADIR%%/tokconfig-nld-twitter
+%%DATADIR%%/tokconfig-nld-withplaceholder
+%%DATADIR%%/tokconfig-por
+%%DATADIR%%/tokconfig-rus
+%%DATADIR%%/tokconfig-spa
+%%DATADIR%%/tokconfig-swe
+%%DATADIR%%/tokconfig-tur
+%%DATADIR%%/tur.abr
+%%DATADIR%%/url.rule