aboutsummaryrefslogtreecommitdiffstats
path: root/e-util/e-html-utils.c
blob: 12aee18d2b99f3b73cb5687a2a09faf0326309de (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153<# Ports collection makefile for:  pecl-paradox
# Date created:           2006-02-11
# Whom:               Alexander Zhuravlev <zaa@zaa.pp.ru>
#
# $FreeBSD$
#

# Port identity, version and the site the .tgz distfile is fetched from.
PORTNAME=   paradox
DISTVERSION=    1.4.1
CATEGORIES= databases www
MASTER_SITES=   http://pecl.php.net/get/
PKGNAMEPREFIX=  pecl-
EXTRACT_SUFX=   .tgz
DIST_SUBDIR=    PECL

MAINTAINER= zaa@zaa.pp.ru
COMMENT=    An extension to read and write Paradox files

# Depends on the px library (version 6) provided by databases/pxlib.
LIB_DEPENDS=    px.6:${PORTSDIR}/databases/pxlib

# PHP extension build knobs.
# NOTE(review): their exact semantics come from the ports framework, which is
# not visible here; the names suggest PHP 5 is the default and PHP 4 is
# unsupported -- confirm against the framework documentation.
USE_PHP=    yes
USE_PHPIZE= yes
USE_PHPEXT= yes
DEFAULT_PHP_VER=5
BROKEN_WITH_PHP=4
PHP_MODNAME=    paradox

# Point configure at ${LOCALBASE} for the paradox library and for the
# preprocessor/linker search paths.
CONFIGURE_ARGS+=    --with-paradox=${LOCALBASE}
CONFIGURE_ENV=      CPPFLAGS=-I${LOCALBASE}/include LDFLAGS=-L${LOCALBASE}/lib

# Rewrite config.m4 so that every "lib -lm -ldl" becomes
# "lib -lm -liconv -lintl" (drops -ldl, adds -liconv and -lintl).
post-patch:
    @${REINPLACE_CMD} -E 's/(lib -lm) -ldl/\1 -liconv -lintl/g' \
        ${WRKSRC}/config.m4

.include <bsd.port.mk>
a id='n241' href='#n241'>241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/* e-html-utils.c
 * Copyright (C) 2000  Ximian, Inc.
 * Author: Dan Winship <danw@ximian.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <glib.h>

#include "e-html-utils.h"

/* Make sure @len more bytes plus a terminating NUL can be written at @out.
 *
 * @buffer and @buffer_size describe the growable output buffer; @out is
 * the current write position inside it.  When the space is insufficient
 * the buffer is reallocated to at least double its size (or to exactly
 * what is needed, whichever is larger) and both *@buffer and
 * *@buffer_size are updated.
 *
 * Returns the write position to use from now on: @out itself if nothing
 * moved, otherwise the same offset inside the reallocated buffer.
 */
static char *
check_size (char **buffer, int *buffer_size, char *out, int len)
{
    int used, needed;

    /* Fast path: the requested bytes and the NUL still fit. */
    if (out + len + 1 <= *buffer + *buffer_size)
        return out;

    /* Remember the offset; the base pointer may change on realloc. */
    used = out - *buffer;
    needed = used + len + 1;

    *buffer_size = MAX (needed, *buffer_size * 2);
    *buffer = g_realloc (*buffer, *buffer_size);

    return *buffer + used;
}

/* Classification table for the 128 US-ASCII code points.  Each entry is
 * a bitmask built from:
 *
 *   1 = non-email-address chars: ()<>@,;:\"[]`'{}|
 *   2 = trailing url garbage:    ,.!?;:>)]}`'-_|
 *   4 = dns chars
 *
 * The table is only ever read, so it is const.
 */
static const int special_chars[] = {
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,    /*  nul - 0x0f */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,    /* 0x10 - 0x1f */
    1, 2, 1, 0, 0, 0, 0, 3, 1, 3, 0, 0, 3, 6, 6, 0,    /*   sp - /    */
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 1, 0, 3, 2,    /*    0 - ?    */
    1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,    /*    @ - O    */
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 3, 0, 2,    /*    P - _    */
    3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,    /*    ` - o    */
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 3, 3, 0, 3     /*    p - del  */
};

/* Character-class predicates over special_chars.
 *
 * The argument is fully parenthesized so that compound expressions
 * (e.g. a conditional) classify the value they evaluate to.  It is still
 * evaluated more than once, so it must not have side effects, and it must
 * be non-negative (pass an unsigned char, not a possibly-negative char).
 *
 * Anything outside US-ASCII is never an address or domain-name character
 * and is always treated as trailing garbage.
 */
#define is_addr_char(c) ((c) < 128 && !(special_chars[(c)] & 1))
#define is_trailing_garbage(c) ((c) > 127 || (special_chars[(c)] & 2))
#define is_domain_name_char(c) ((c) < 128 && (special_chars[(c)] & 4))

/* Pull a URL-looking token off the front of *@text.
 *
 * The token runs up to the first NUL, whitespace, double quote or
 * non-ASCII byte; characters classified as trailing garbage are then
 * trimmed from its end.
 *
 * When @check is TRUE the token must contain a ':' that lies at least
 * four bytes before its end, otherwise NULL is returned and *@text is
 * left untouched.
 *
 * On success returns a newly-allocated copy of the token and advances
 * *@text to the first byte after it.
 */
static char *
url_extract (const unsigned char **text, gboolean check)
{
    const unsigned char *begin = *text;
    const unsigned char *stop;
    const unsigned char *colon;
    char *url;

    /* Scan forward to the first byte that cannot be part of a URL. */
    for (stop = begin; *stop != '\0'; stop++) {
        if (isspace (*stop) || *stop == '"' || *stop >= 0x80)
            break;
    }

    /* Trim punctuation that more likely belongs to the sentence. */
    while (stop > begin && is_trailing_garbage (*(stop - 1)))
        stop--;

    if (check) {
        /* Make sure something follows the scheme separator. */
        colon = memchr (begin, ':', stop - begin);
        if (colon == NULL || stop - colon < 4)
            return NULL;
    }

    url = g_strndup ((const char *) begin, stop - begin);
    *text = stop;

    return url;
}

/* Try to recognize a mail address around the '@' that *@cur points to.
 *
 * @cur:       in: position of the '@' in the input; out: on success,
 *             the first byte after the address.
 * @out:       current write position in the output buffer.  The
 *             local-part has already been written there by
 *             e_text_to_html_full(), so on success *@out is moved back
 *             to where that text starts, letting the caller overwrite it
 *             with a link.
 * @linestart: the backward scan for the local-part never reads before
 *             this position.
 *
 * Returns a newly-allocated copy of the address, or NULL (leaving *@cur
 * and *@out untouched) if the text around the '@' does not look like an
 * address: empty local-part, or no '.' left inside the domain once
 * trailing punctuation has been trimmed.
 */
static char *
email_address_extract (const unsigned char **cur, char **out, const unsigned char *linestart)
{
    const unsigned char *start, *end, *dot, *p;
    char *addr;
    int emitted;

    /* *cur points to the '@'. Look backward for a valid local-part.
     * Compare before stepping back so that no pointer in front of
     * linestart is ever formed.
     */
    for (start = *cur; start > linestart && is_addr_char (*(start - 1)); start--)
        ;
    if (start == *cur)
        return NULL;

    /* Now look forward for a valid domain part, noting its first dot */
    for (end = *cur + 1, dot = NULL; is_domain_name_char (*end); end++) {
        if (*end == '.' && !dot)
            dot = end;
    }
    if (!dot)
        return NULL;

    /* Remove trailing garbage, but never back up over the '@' */
    while (end > *cur + 1 && is_trailing_garbage (*(end - 1)))
        end--;

    /* The dot must still be inside the domain.  If it now sits at (or
     * beyond) the end, it was only sentence punctuation, as in
     * "write to user@host." and the remaining domain has no dot.
     */
    if (dot >= end)
        return NULL;

    /* Work out how many output bytes the local-part occupies.  Of all
     * the characters is_addr_char() accepts, '&' is the only one that
     * e_text_to_html_full() expands (to the 5-byte "&amp;"); every
     * other one is copied as a single byte.
     */
    emitted = 0;
    for (p = start; p < *cur; p++)
        emitted += (*p == '&') ? 5 : 1;

    addr = g_strndup ((const char *) start, end - start);
    *out -= emitted;
    *cur = end;

    return addr;
}

/* Decide whether the line starting at @c is a citation (quoted text).
 *
 * @c:            start of a line in a NUL-terminated buffer.
 * @saw_citation: whether the previous line was judged to be a citation.
 *
 * A line is a citation when it begins with '>'.  The one exception is
 * a line beginning with ">From ", which may merely be mbox
 * From-mangling: such a line only counts as a citation when the
 * previous line was one or the following line also begins with '>'.
 */
static gboolean
is_citation (const unsigned char *c, gboolean saw_citation)
{
    const unsigned char *newline;
    gboolean mangled_from;

    if (c[0] != '>')
        return FALSE;

    mangled_from = (strncmp ((const char *) c, ">From ", 6) == 0);

    /* An ordinary ">" line, or a ">From " line that continues a
     * citation, needs no further inspection.
     */
    if (!mangled_from || saw_citation)
        return TRUE;

    /* Peek at the first character of the next line, if there is one. */
    newline = (const unsigned char *) strchr ((const char *) c, '\n');
    if (newline != NULL && newline[1] == '>')
        return TRUE;

    /* An isolated ">From " line: just mbox mangling. */
    return FALSE;
}

/**
 * e_text_to_html_full:
 * @input: a NUL-terminated input buffer
 * @flags: some combination of the E_TEXT_TO_HTML_* flags defined
 * in e-html-utils.h
 * @color: color for citation highlighting, written as six or more
 * hexadecimal digits into a <FONT COLOR="#..."> tag
 *
 * This takes a buffer of text as input and produces a buffer of
 * "equivalent" HTML, subject to certain transformation rules.
 * The characters <, >, & and " are always replaced by the matching
 * entities. Characters outside printable US-ASCII (other than CR, TAB
 * and newline) are written as numeric character references unless
 * E_TEXT_TO_HTML_ESCAPE_8BIT is set. Bytes that do not form valid UTF-8
 * are assumed to be iso-8859-1.
 *
 * The set of possible flags is:
 *
 *   - E_TEXT_TO_HTML_PRE: wrap the output HTML in <PRE> and </PRE>.
 *     Should only be used if @input is the entire buffer to be
 *     converted. If e_text_to_html is being called with small pieces
 *     of data, you should wrap the entire result in <PRE> yourself.
 *
 *   - E_TEXT_TO_HTML_CONVERT_NL: convert "\n" to "<br>\n" on output.
 *     (should not be used with E_TEXT_TO_HTML_PRE, since that would
 *     result in double-newlines).
 *
 *   - E_TEXT_TO_HTML_CONVERT_SPACES: convert a block of N spaces
 *     into N-1 non-breaking spaces and one normal space. A space
 *     at the start of the buffer is always converted to a
 *     non-breaking space, regardless of the following character,
 *     which probably means you don't want to use this flag on
 *     pieces of data that aren't delimited by at least line breaks.
 *
 *     If either E_TEXT_TO_HTML_CONVERT_NL or
 *     E_TEXT_TO_HTML_CONVERT_SPACES is set, each TAB is expanded into
 *     non-breaking spaces up to the next column that is a multiple
 *     of 8.
 *
 *   - E_TEXT_TO_HTML_CONVERT_URLS: wrap <a href="..."> </a> around
 *     strings that look like URLs (http, https, ftp, nntp, mailto,
 *     news and file schemes, plus bare "www." host names, which get
 *     an "http://" prefix in the link target).
 *
 *   - E_TEXT_TO_HTML_CONVERT_ADDRESSES: wrap <a href="mailto:..."> </a> around
 *     strings that look like mail addresses.
 *
 *   - E_TEXT_TO_HTML_MARK_CITATION: wrap <FONT COLOR="..."> </FONT> around
 *     citations (lines beginning with ">"). A leading '>' that is
 *     only mbox From-mangling is dropped from the output.
 *
 *   - E_TEXT_TO_HTML_ESCAPE_8BIT: flatten everything to US-ASCII by
 *     writing '?' instead of a numeric character reference.
 *
 *   - E_TEXT_TO_HTML_CITE: quote the text with "> " at the start of each
 *     line. Ignored when E_TEXT_TO_HTML_MARK_CITATION is also set.
 *
 * Returns: a newly-allocated NUL-terminated buffer holding the HTML;
 * it is allocated with g_malloc(), so release it with g_free().
 **/
char *
e_text_to_html_full (const char *input, unsigned int flags, guint32 color)
{
    /* Most bytes a single character can produce in the switch below,
     * not counting the NUL: "&#4294967295;" is 13 bytes.
     */
    enum { MAX_CHAR_OUTPUT = 13 };
    const unsigned char *cur, *next, *linestart;
    char *buffer = NULL;
    char *out = NULL;
    int buffer_size = 0, col;
    gboolean colored = FALSE, saw_citation = FALSE;

    /* Allocate a translation buffer.  The constant leaves room for
     * "<PRE>" and its NUL (6 bytes) even when @input is empty.
     */
    buffer_size = strlen (input) * 2 + 6;
    buffer = g_malloc (buffer_size);

    out = buffer;
    if (flags & E_TEXT_TO_HTML_PRE)
        out += sprintf (out, "<PRE>");

    col = 0;

    for (cur = linestart = (const unsigned char *) input; cur && *cur; cur = next) {
        gunichar u;

        if (flags & E_TEXT_TO_HTML_MARK_CITATION && col == 0) {
            /* At the start of a line: open or close the
             * citation coloring as the citation state changes.
             */
            saw_citation = is_citation (cur, saw_citation);
            if (saw_citation) {
                if (!colored) {
                    gchar font [25];

                    g_snprintf (font, 25, "<FONT COLOR=\"#%06x\">", color);

                    out = check_size (&buffer, &buffer_size, out, 25);
                    out += sprintf (out, "%s", font);
                    colored = TRUE;
                }
            } else if (colored) {
                gchar *no_font = "</FONT>";

                out = check_size (&buffer, &buffer_size, out, 9);
                out += sprintf (out, "%s", no_font);
                colored = FALSE;
            }

            /* Display mbox-mangled ">From" as "From" */
            if (*cur == '>' && !saw_citation)
                cur++;
        } else if (flags & E_TEXT_TO_HTML_CITE && col == 0) {
            out = check_size (&buffer, &buffer_size, out, 5);
            out += sprintf (out, "&gt; ");
        }

        u = g_utf8_get_char ((const char *) cur);
        if (g_unichar_isalpha (u) &&
            (flags & E_TEXT_TO_HTML_CONVERT_URLS)) {
            const char *text = (const char *) cur;
            char *tmpurl = NULL, *refurl = NULL, *dispurl = NULL;

            if (!strncasecmp (text, "http://", 7) ||
                !strncasecmp (text, "https://", 8) ||
                !strncasecmp (text, "ftp://", 6) ||
                !strncasecmp (text, "nntp://", 7) ||
                !strncasecmp (text, "mailto:", 7) ||
                !strncasecmp (text, "news:", 5) ||
                !strncasecmp (text, "file:", 5)) {
                /* May return NULL (and leave cur alone) if
                 * nothing useful follows the scheme.
                 */
                tmpurl = url_extract (&cur, TRUE);
                if (tmpurl) {
                    refurl = e_text_to_html (tmpurl, 0);
                    dispurl = g_strdup (refurl);
                }
            } else if (!strncasecmp (text, "www.", 4) &&
                   (*(cur + 4) < 0x80) &&
                   g_unichar_isalnum (*(cur + 4))) {
                /* Bare host name: show it as typed, but
                 * link to it with an explicit scheme.
                 */
                tmpurl = url_extract (&cur, FALSE);
                dispurl = e_text_to_html (tmpurl, 0);
                refurl = g_strdup_printf ("http://%s",
                              dispurl);
            }

            if (tmpurl) {
                /* 15 = strlen of the markup around the two strings */
                out = check_size (&buffer, &buffer_size, out,
                          strlen (refurl) +
                          strlen (dispurl) + 15);
                out += sprintf (out,
                        "<a href=\"%s\">%s</a>",
                        refurl, dispurl);
                col += strlen (tmpurl);
                g_free (tmpurl);
                g_free (refurl);
                g_free (dispurl);
            }

            if (!*cur)
                break;
            u = g_utf8_get_char ((const char *) cur);
        }

        if (u == '@' && (flags & E_TEXT_TO_HTML_CONVERT_ADDRESSES)) {
            char *addr, *dispaddr, *outaddr;

            /* On success this also rewinds out over the
             * local-part that has already been written.
             */
            addr = email_address_extract (&cur, &out, linestart);
            if (addr) {
                /* Use the escaped form in the href as well as
                 * in the link text, as the URL branch does, so
                 * that an '&' in the address cannot end up raw
                 * inside the attribute.
                 */
                dispaddr = e_text_to_html (addr, 0);
                outaddr = g_strdup_printf ("<a href=\"mailto:%s\">%s</a>",
                               dispaddr, dispaddr);
                out = check_size (&buffer, &buffer_size, out, strlen (outaddr));
                out += sprintf (out, "%s", outaddr);
                col += strlen (addr);
                g_free (addr);
                g_free (dispaddr);
                g_free (outaddr);

                if (!*cur)
                    break;
                u = g_utf8_get_char ((const char *) cur);
            }
        }

        if (!g_unichar_validate (u)) {
            /* Sigh. Someone sent undeclared 8-bit data.
             * Assume it's iso-8859-1.
             */
            u = *cur;
            next = cur + 1;
        } else
            next = g_utf8_next_char (cur);

        out = check_size (&buffer, &buffer_size, out, MAX_CHAR_OUTPUT);

        switch (u) {
        case '<':
            strcpy (out, "&lt;");
            out += 4;
            col++;
            break;

        case '>':
            strcpy (out, "&gt;");
            out += 4;
            col++;
            break;

        case '&':
            strcpy (out, "&amp;");
            out += 5;
            col++;
            break;

        case '"':
            strcpy (out, "&quot;");
            out += 6;
            col++;
            break;

        case '\n':
            if (flags & E_TEXT_TO_HTML_CONVERT_NL) {
                strcpy (out, "<br>");
                out += 4;
            }
            *out++ = *cur;
            linestart = cur;
            col = 0;
            break;

        case '\t':
            if (flags & (E_TEXT_TO_HTML_CONVERT_SPACES |
                     E_TEXT_TO_HTML_CONVERT_NL)) {
                /* Pad to the next multiple-of-8 column. */
                do {
                    out = check_size (&buffer, &buffer_size,
                            out, 7);
                    strcpy (out, "&nbsp;");
                    out += 6;
                    col++;
                } while (col % 8);
                break;
            }
            /* otherwise, FALL THROUGH */

        case ' ':
            if (flags & E_TEXT_TO_HTML_CONVERT_SPACES) {
                if (cur == (const unsigned char *)input ||
                    *(cur + 1) == ' ' || *(cur + 1) == '\t' ||
                    *(cur - 1) == '\n') {
                    strcpy (out, "&nbsp;");
                    out += 6;
                    col++;
                    break;
                }
            }
            /* otherwise, FALL THROUGH */

        default:
            if ((u >= 0x20 && u < 0x80) ||
                (u == '\r' || u == '\t')) {
                /* Default case, just copy. */
                *out++ = u;
            } else {
                if (flags & E_TEXT_TO_HTML_ESCAPE_8BIT)
                    *out++ = '?';
                else {
                    /* The limit must cover the longest
                     * possible reference: on truncation
                     * g_snprintf() may still return the
                     * full length, which would advance
                     * out past the NUL it wrote.
                     */
                    out += g_snprintf (out, MAX_CHAR_OUTPUT + 1,
                               "&#%u;", (unsigned int) u);
                }
            }
            col++;
            break;
        }
    }

    out = check_size (&buffer, &buffer_size, out, 7);
    if (flags & E_TEXT_TO_HTML_PRE)
        strcpy (out, "</PRE>");
    else
        *out = '\0';

    return buffer;
}

char *
e_text_to_html (const char *input, unsigned int flags)
{
    return e_text_to_html_full (input, flags, 0);
}