diff options
author | yuri <yuri@FreeBSD.org> | 2018-12-08 12:26:45 +0800 |
---|---|---|
committer | yuri <yuri@FreeBSD.org> | 2018-12-08 12:26:45 +0800 |
commit | 59743feb6549e4d895aa48424fd36d09a8317e68 (patch) | |
tree | 78b5dbff33f4469bde3557a0674fa8f5f3e16b82 /textproc | |
parent | 076802b3eb932586afbc4c072e13861cbbad079a (diff) | |
download | freebsd-ports-gnome-59743feb6549e4d895aa48424fd36d09a8317e68.tar.gz freebsd-ports-gnome-59743feb6549e4d895aa48424fd36d09a8317e68.tar.zst freebsd-ports-gnome-59743feb6549e4d895aa48424fd36d09a8317e68.zip |
New ports: textproc/ucto, textproc/uctodata: Advanced rule-based (regular-expression) and unicode-aware tokenizer and its data port
Diffstat (limited to 'textproc')
-rw-r--r-- | textproc/Makefile | 2 | ||||
-rw-r--r-- | textproc/ucto/Makefile | 33 | ||||
-rw-r--r-- | textproc/ucto/distinfo | 3 | ||||
-rw-r--r-- | textproc/ucto/files/patch-config_Makefile.am | 12 | ||||
-rw-r--r-- | textproc/ucto/pkg-descr | 10 | ||||
-rw-r--r-- | textproc/ucto/pkg-plist | 12 | ||||
-rw-r--r-- | textproc/uctodata/Makefile | 23 | ||||
-rw-r--r-- | textproc/uctodata/distinfo | 3 | ||||
-rw-r--r-- | textproc/uctodata/pkg-descr | 4 | ||||
-rw-r--r-- | textproc/uctodata/pkg-plist | 33 |
10 files changed, 135 insertions, 0 deletions
diff --git a/textproc/Makefile b/textproc/Makefile index cb8d7b07a1c7..30e0f160a250 100644 --- a/textproc/Makefile +++ b/textproc/Makefile @@ -1789,6 +1789,8 @@ SUBDIR += txt2man SUBDIR += txt2tags SUBDIR += uchardet + SUBDIR += ucto + SUBDIR += uctodata SUBDIR += uim SUBDIR += uim-el SUBDIR += uim-gtk diff --git a/textproc/ucto/Makefile b/textproc/ucto/Makefile new file mode 100644 index 000000000000..13793707aeb6 --- /dev/null +++ b/textproc/ucto/Makefile @@ -0,0 +1,33 @@ +# $FreeBSD$ + +PORTNAME= ucto +DISTVERSIONPREFIX= v +DISTVERSION= 0.14 +CATEGORIES= textproc + +MAINTAINER= yuri@FreeBSD.org +COMMENT= Advanced rule-based (regular-expression) and unicode-aware tokenizer + +LICENSE= APACHE20 +LICENSE_FILE= ${WRKSRC}/COPYING + +BUILD_DEPENDS= autoconf-archive>0:devel/autoconf-archive \ + uctodata>0:textproc/uctodata +LIB_DEPENDS= libexttextcat-2.0.so:textproc/libexttextcat \ + libfolia.so:textproc/libfolia \ + libicuio.so:devel/icu \ + libomp.so:devel/openmp \ + libticcutils.so:devel/ticcutils +RUN_DEPENDS= uctodata>0:textproc/uctodata + +USES= autoreconf gmake gnome libedit libtool pkgconfig readline +GNU_CONFIGURE= yes +CONFIGURE_ARGS= --disable-static +USE_GITHUB= yes +GH_ACCOUNT= LanguageMachines +USE_GNOME= libxml2 +USE_LDCONFIG= yes + +INSTALL_TARGET= install-strip + +.include <bsd.port.mk> diff --git a/textproc/ucto/distinfo b/textproc/ucto/distinfo new file mode 100644 index 000000000000..0123227cea86 --- /dev/null +++ b/textproc/ucto/distinfo @@ -0,0 +1,3 @@ +TIMESTAMP = 1544204678 +SHA256 (LanguageMachines-ucto-v0.14_GH0.tar.gz) = ba40c28b0baba4eef98f88abc7c894a4b6fbaf153eaacd2ea3c9c177b0e85ea5 +SIZE (LanguageMachines-ucto-v0.14_GH0.tar.gz) = 350837 diff --git a/textproc/ucto/files/patch-config_Makefile.am b/textproc/ucto/files/patch-config_Makefile.am new file mode 100644 index 000000000000..d27cdd1baefc --- /dev/null +++ b/textproc/ucto/files/patch-config_Makefile.am @@ -0,0 +1,12 @@ +--- config/Makefile.am.orig 2018-12-08 03:11:07 UTC ++++ 
config/Makefile.am +@@ -7,7 +7,7 @@ EXTRA_DIST = $(config_DATA) + install-data-hook: + rm -f $(configdir)/textcat.cfg + if OLD_LM +- $(LN_S) $(configdir)/textcat_alt.cfg $(configdir)/textcat.cfg ++ cd $(DESTDIR)$(configdir) && $(LN_S) textcat_alt.cfg textcat.cfg + else +- $(LN_S) $(configdir)/textcat_normal.cfg $(configdir)/textcat.cfg ++ cd $(DESTDIR)$(configdir) && $(LN_S) textcat_normal.cfg textcat.cfg + endif diff --git a/textproc/ucto/pkg-descr b/textproc/ucto/pkg-descr new file mode 100644 index 000000000000..c731cb067058 --- /dev/null +++ b/textproc/ucto/pkg-descr @@ -0,0 +1,10 @@ +Ucto tokenizes text files: it separates words from punctuation, and splits +sentences. It offers several other basic preprocessing steps such as changing +case that you can all use to make your text suited for further processing such +as indexing, part-of-speech tagging, or machine translation. + +Ucto comes with tokenisation rules for several languages and can be easily +extended to suit other languages. It has been incorporated for tokenizing Dutch +text in Frog, our Dutch morpho-syntactic processor. 
+ +WWW: https://languagemachines.github.io/ucto/ diff --git a/textproc/ucto/pkg-plist b/textproc/ucto/pkg-plist new file mode 100644 index 000000000000..8364aed5662d --- /dev/null +++ b/textproc/ucto/pkg-plist @@ -0,0 +1,12 @@ +bin/ucto +include/ucto/my_textcat.h +include/ucto/setting.h +include/ucto/tokenize.h +lib/libucto.so +lib/libucto.so.3 +lib/libucto.so.3.0.0 +libdata/pkgconfig/ucto.pc +man/man1/ucto.1.gz +%%DATADIR%%/textcat.cfg +%%DATADIR%%/textcat_alt.cfg +%%DATADIR%%/textcat_normal.cfg diff --git a/textproc/uctodata/Makefile b/textproc/uctodata/Makefile new file mode 100644 index 000000000000..d93728a13f84 --- /dev/null +++ b/textproc/uctodata/Makefile @@ -0,0 +1,23 @@ +# $FreeBSD$ + +PORTNAME= uctodata +DISTVERSIONPREFIX= v +DISTVERSION= 0.8 +CATEGORIES= textproc + +MAINTAINER= yuri@FreeBSD.org +COMMENT= Datafiles for the tokenizer 'ucto' + +LICENSE= APACHE20 +LICENSE_FILE= ${WRKSRC}/COPYING + +USES= autoreconf gmake +GNU_CONFIGURE= yes +USE_GITHUB= yes +GH_ACCOUNT= LanguageMachines + +NO_ARCH= yes + +DATADIR= ${PREFIX}/share/ucto + +.include <bsd.port.mk> diff --git a/textproc/uctodata/distinfo b/textproc/uctodata/distinfo new file mode 100644 index 000000000000..14ac076b6795 --- /dev/null +++ b/textproc/uctodata/distinfo @@ -0,0 +1,3 @@ +TIMESTAMP = 1544225721 +SHA256 (LanguageMachines-uctodata-v0.8_GH0.tar.gz) = a8e5e69696facbd2c2251406560762cf7f4817620179e4a8ee8d241cf0371a5e +SIZE (LanguageMachines-uctodata-v0.8_GH0.tar.gz) = 37527 diff --git a/textproc/uctodata/pkg-descr b/textproc/uctodata/pkg-descr new file mode 100644 index 000000000000..aab02567a2ee --- /dev/null +++ b/textproc/uctodata/pkg-descr @@ -0,0 +1,4 @@ +Datafiles for ucto, the rule-based tokenization package that is used to +parse texts in different languages. 
+ +WWW: https://languagemachines.github.io/ucto/ diff --git a/textproc/uctodata/pkg-plist b/textproc/uctodata/pkg-plist new file mode 100644 index 000000000000..8f93d4a2e507 --- /dev/null +++ b/textproc/uctodata/pkg-plist @@ -0,0 +1,33 @@ +libdata/pkgconfig/uctodata.pc +%%DATADIR%%/e-mail.rule +%%DATADIR%%/exotic-eos.eos +%%DATADIR%%/exotic-quotes.quote +%%DATADIR%%/fra.abr +%%DATADIR%%/fry.abr +%%DATADIR%%/ligatures.filter +%%DATADIR%%/nld_afk.abr +%%DATADIR%%/por.abr +%%DATADIR%%/rus.abr +%%DATADIR%%/smiley.rule +%%DATADIR%%/spa.abr +%%DATADIR%%/standard-eos.eos +%%DATADIR%%/standard-quotes.quote +%%DATADIR%%/swe.abr +%%DATADIR%%/tokconfig-deu +%%DATADIR%%/tokconfig-eng +%%DATADIR%%/tokconfig-fra +%%DATADIR%%/tokconfig-fry +%%DATADIR%%/tokconfig-generic +%%DATADIR%%/tokconfig-ita +%%DATADIR%%/tokconfig-nld +%%DATADIR%%/tokconfig-nld-historical +%%DATADIR%%/tokconfig-nld-sonarchat +%%DATADIR%%/tokconfig-nld-twitter +%%DATADIR%%/tokconfig-nld-withplaceholder +%%DATADIR%%/tokconfig-por +%%DATADIR%%/tokconfig-rus +%%DATADIR%%/tokconfig-spa +%%DATADIR%%/tokconfig-swe +%%DATADIR%%/tokconfig-tur +%%DATADIR%%/tur.abr +%%DATADIR%%/url.rule |