diff options
Diffstat (limited to 'e-util/ename/e-name-western.c')
-rw-r--r-- | e-util/ename/e-name-western.c | 956 |
1 files changed, 0 insertions, 956 deletions
diff --git a/e-util/ename/e-name-western.c b/e-util/ename/e-name-western.c deleted file mode 100644 index c016533d91..0000000000 --- a/e-util/ename/e-name-western.c +++ /dev/null @@ -1,956 +0,0 @@ -/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ -/* - * A simple Western name parser. - * - * <Nat> Jamie, do you know anything about name parsing? - * <jwz> Are you going down that rat hole? Bring a flashlight. - * - * Authors: - * Nat Friedman <nat@ximian.com> - * - * Copyright 1999 - 2001, Ximian, Inc. - */ - -#include <ctype.h> -#include <string.h> -#include <glib.h> - -#include <ename/e-name-western.h> -#include <ename/e-name-western-tables.h> - -typedef struct { - int prefix_idx; - int first_idx; - int middle_idx; - int nick_idx; - int last_idx; - int suffix_idx; -} ENameWesternIdxs; - -static int -e_name_western_str_count_words (char *str) -{ - int word_count; - char *p; - - word_count = 0; - - for (p = str; p != NULL; p = strchr (p, ' ')) { - word_count ++; - p ++; - } - - return word_count; -} - -static void -e_name_western_cleanup_string (char **str) -{ - char *newstr; - char *p; - - if (*str == NULL) - return; - - /* skip any spaces and commas at the start of the string */ - p = *str; - while (isspace (*p) || *p == ',') - p ++; - - /* make the copy we're going to return */ - newstr = g_strdup (p); - - if ( strlen(newstr) > 0) { - /* now search from the back, skipping over any spaces and commas */ - p = newstr + strlen (newstr) - 1; - while (isspace (*p) || *p == ',') - p --; - /* advance p to after the character that caused us to exit the - previous loop, and end the string. */ - if ((! isspace (*p)) && *p != ',') - p ++; - *p = '\0'; - } - - g_free (*str); - *str = newstr; -} - -static char * -e_name_western_get_words_at_idx (char *str, int idx, int num_words) -{ - char *words; - char *p; - int word_count; - int words_len; - - /* - * Walk to the end of the words. - */ - word_count = 0; - p = str + idx; - while (word_count < num_words && *p != '\0') { - while (! isspace (*p) && *p != '\0') - p ++; - - while (isspace (*p) && *p != '\0') - p ++; - - word_count ++; - } - - words_len = p - str - idx - 1; - - if (*p == '\0') - words_len ++; - - words = g_malloc0 (1 + words_len); - strncpy (words, str + idx, words_len); - - return words; -} - -/* - * What the fuck is wrong with glib's MAX macro. - */ -static int -e_name_western_max (const int a, const int b) -{ - if (a > b) - return a; - - return b; -} - -static gboolean -e_name_western_word_is_suffix (char *word) -{ - int i; - - for (i = 0; e_name_western_sfx_table [i] != NULL; i ++) { - if (g_strcasecmp (word, e_name_western_sfx_table [i])) - continue; - - return TRUE; - } - - return FALSE; -} - -static char * -e_name_western_get_one_prefix_at_str (char *str) -{ - char *word; - int i; - - /* - * Check for prefixes from our table. - */ - for (i = 0; e_name_western_pfx_table [i] != NULL; i ++) { - int pfx_words; - char *words; - - pfx_words = e_name_western_str_count_words (e_name_western_pfx_table [i]); - words = e_name_western_get_words_at_idx (str, 0, pfx_words); - - if (! g_strcasecmp (words, e_name_western_pfx_table [i])) - return words; - - g_free (words); - } - - /* - * Check for prefixes we don't know about. These are always a - * sequence of more than one letters followed by a period. - */ - word = e_name_western_get_words_at_idx (str, 0, 1); - - if (strlen (word) > 2 && - isalpha ((unsigned char) word [0]) && - isalpha ((unsigned char) word [1]) && - word [strlen (word) - 1] == '.') - return word; - - g_free (word); - - return NULL; -} - -static char * -e_name_western_get_prefix_at_str (char *str) -{ - char *pfx; - char *pfx1; - char *pfx2; - char *p; - - /* Get the first prefix. */ - pfx1 = e_name_western_get_one_prefix_at_str (str); - - if (pfx1 == NULL) - return NULL; - - /* Check for a second prefix. */ - p = str + strlen (pfx1); - while (isspace (*p) && *p != '\0') - p ++; - - pfx2 = e_name_western_get_one_prefix_at_str (p); - - if (pfx2 != NULL) { - int pfx_len; - - pfx_len = (p + strlen (pfx2)) - str; - pfx = g_malloc0 (pfx_len + 1); - strncpy (pfx, str, pfx_len); - } else { - pfx = g_strdup (pfx1); - } - - g_free (pfx1); - g_free (pfx2); - - return pfx; -} - -static void -e_name_western_extract_prefix (ENameWestern *name, ENameWesternIdxs *idxs) -{ - char *pfx; - - pfx = e_name_western_get_prefix_at_str (name->full); - - if (pfx == NULL) - return; - - idxs->prefix_idx = 0; - name->prefix = pfx; -} - -static gboolean -e_name_western_is_complex_last_beginning (char *word) -{ - int i; - - for (i = 0; e_name_western_complex_last_table [i] != NULL; i ++) { - - if (! g_strcasecmp ( - word, e_name_western_complex_last_table [i])) - return TRUE; - } - - return FALSE; -} - -static void -e_name_western_extract_first (ENameWestern *name, ENameWesternIdxs *idxs) -{ - /* - * If there's a prefix, then the first name is right after it. - */ - if (idxs->prefix_idx != -1) { - int first_idx; - char *p; - - first_idx = idxs->prefix_idx + strlen (name->prefix); - - /* Skip past white space. */ - p = name->full + first_idx; - while (isspace (*p) && *p != '\0') - p++; - - if (*p == '\0') - return; - - idxs->first_idx = p - name->full; - name->first = e_name_western_get_words_at_idx ( - name->full, idxs->first_idx, 1); - - } else { - - /* - * Otherwise, the first name is probably the first string. - */ - idxs->first_idx = 0; - name->first = e_name_western_get_words_at_idx ( - name->full, idxs->first_idx, 1); - } - - /* - * Check that we didn't just assign the beginning of a - * compound last name to the first name. - */ - if (name->first != NULL) { - if (e_name_western_is_complex_last_beginning (name->first)) { - g_free (name->first); - name->first = NULL; - idxs->first_idx = -1; - } - } -} - -static void -e_name_western_extract_middle (ENameWestern *name, ENameWesternIdxs *idxs) -{ - char *word; - int middle_idx; - - /* - * Middle names can only exist if you have a first name. - */ - if (idxs->first_idx == -1) - return; - - middle_idx = idxs->first_idx + strlen (name->first) + 1; - - if (middle_idx > strlen (name->full)) - return; - - /* - * Search for the first space (or the terminating \0) - */ - while (isspace (name->full [middle_idx]) && - name->full [middle_idx] != '\0') - middle_idx ++; - - if (name->full [middle_idx] == '\0') - return; - - /* - * Skip past the nickname, if it's there. - */ - if (name->full [middle_idx] == '\"') { - if (idxs->nick_idx == -1) - return; - - middle_idx = idxs->nick_idx + strlen (name->nick) + 1; - - while (isspace (name->full [middle_idx]) && - name->full [middle_idx] != '\0') - middle_idx ++; - - if (name->full [middle_idx] == '\0') - return; - } - - /* - * Make sure this isn't the beginning of a complex last name. - */ - word = e_name_western_get_words_at_idx (name->full, middle_idx, 1); - if (e_name_western_is_complex_last_beginning (word)) { - g_free (word); - return; - } - - /* - * Make sure this isn't a suffix. - */ - e_name_western_cleanup_string (& word); - if (e_name_western_word_is_suffix (word)) { - g_free (word); - return; - } - - /* - * Make sure we didn't just grab a cute nickname. - */ - if (word [0] == '\"') { - g_free (word); - return; - } - - idxs->middle_idx = middle_idx; - name->middle = word; -} - -static void -e_name_western_extract_nickname (ENameWestern *name, ENameWesternIdxs *idxs) -{ - int idx; - int start_idx; - char *str; - - if (idxs->first_idx == -1) - return; - - if (idxs->middle_idx > idxs->first_idx) - idx = idxs->middle_idx + strlen (name->middle); - else - idx = idxs->first_idx + strlen (name->first); - - while (name->full [idx] != '\"' && name->full [idx] != '\0') - idx ++; - - if (name->full [idx] != '\"') - return; - - start_idx = idx; - - /* - * Advance to the next double quote. - */ - idx ++; - - while (name->full [idx] != '\"' && name->full [idx] != '\0') - idx ++; - - if (name->full [idx] == '\0') - return; - - str = g_malloc0 (idx - start_idx + 2); - strncpy (str, name->full + start_idx, idx - start_idx + 1); - - name->nick = str; - idxs->nick_idx = start_idx; -} - -static int -e_name_western_last_get_max_idx (ENameWestern *name, ENameWesternIdxs *idxs) -{ - int max_idx = -1; - - if (name->prefix != NULL) - max_idx = e_name_western_max ( - max_idx, idxs->prefix_idx + strlen (name->prefix)); - - if (name->first != NULL) - max_idx = e_name_western_max ( - max_idx, idxs->first_idx + strlen (name->first)); - - if (name->middle != NULL) - max_idx = e_name_western_max ( - max_idx, idxs->middle_idx + strlen (name->middle)); - - if (name->nick != NULL) - max_idx = e_name_western_max ( - max_idx, idxs->nick_idx + strlen (name->nick)); - - return max_idx; -} - -static void -e_name_western_extract_last (ENameWestern *name, ENameWesternIdxs *idxs) -{ - char *word; - int idx = -1; - - idx = e_name_western_last_get_max_idx (name, idxs); - - /* - * In the case where there is no preceding name element, the - * name is either just a first name ("Nat", "John"), is a - * single-element name ("Cher", which we treat as a first - * name), or is just a last name. The only time we can - * differentiate a last name alone from a single-element name - * or a first name alone is if it's a complex last name ("de - * Icaza", "van Josephsen"). So if there is no preceding name - * element, we check to see whether or not the first part of - * the name is the beginning of a complex name. If it is, - * we subsume the entire string. If we accidentally subsume - * the suffix, this will get fixed in the fixup routine. - */ - if (idx == -1) { - word = e_name_western_get_words_at_idx (name->full, 0, 1); - if (! e_name_western_is_complex_last_beginning (word)) { - g_free (word); - return; - } - - name->last = g_strdup (name->full); - idxs->last_idx = 0; - return; - } - - /* Skip past the white space. */ - while (isspace (name->full [idx]) && name->full [idx] != '\0') - idx ++; - - if (name->full [idx] == '\0') - return; - - word = e_name_western_get_words_at_idx (name->full, idx, 1); - e_name_western_cleanup_string (& word); - if (e_name_western_word_is_suffix (word)) { - g_free (word); - return; - } - g_free (word); - - /* - * Subsume the rest of the string into the last name. If we - * accidentally include the prefix, it will get fixed later. - * This is the only way to handle things like "Miguel de Icaza - * Amozorrutia" without dropping data and forcing the user - * to retype it. - */ - name->last = g_strdup (name->full + idx); - idxs->last_idx = idx; -} - -static char * -e_name_western_get_preceding_word (char *str, int idx) -{ - int word_len; - char *word; - char *p; - - p = str + idx; - - while (isspace (*p) && p > str) - p --; - - while (! isspace (*p) && p > str) - p --; - - if (isspace (*p)) - p ++; - - word_len = (str + idx) - p; - word = g_malloc0 (word_len + 1); - if (word_len > 0) - strncpy (word, p, word_len); - - return word; -} - -static char * -e_name_western_get_suffix_at_str_end (char *str) -{ - char *suffix; - char *p; - - /* - * Walk backwards till we reach the beginning of the - * (potentially-comma-separated) list of suffixes. - */ - p = str + strlen (str); - while (1) { - char *nextp; - char *word; - - word = e_name_western_get_preceding_word (str, p - str); - nextp = p - strlen (word) - 1; - - e_name_western_cleanup_string (& word); - - if (e_name_western_word_is_suffix (word)) { - p = nextp; - g_free (word); - } else { - g_free (word); - break; - } - } - - if (p == (str + strlen (str))) - return NULL; - - suffix = g_strdup (p); - e_name_western_cleanup_string (& suffix); - - if (strlen (suffix) == 0) { - g_free (suffix); - return NULL; - } - - return suffix; -} - -static void -e_name_western_extract_suffix (ENameWestern *name, ENameWesternIdxs *idxs) -{ - - name->suffix = e_name_western_get_suffix_at_str_end (name->full); - - if (name->suffix == NULL) - return; - - idxs->suffix_idx = strlen (name->full) - strlen (name->suffix); -} - -static gboolean -e_name_western_detect_backwards (ENameWestern *name, ENameWesternIdxs *idxs) -{ - char *comma; - char *word; - - comma = strchr (name->full, ','); - - if (comma == NULL) - return FALSE; - - /* - * If there's a comma, we need to detect whether it's - * separating the last name from the first or just separating - * suffixes. So we grab the word which comes before the - * comma and check if it's a suffix. - */ - word = e_name_western_get_preceding_word (name->full, comma - name->full); - - if (e_name_western_word_is_suffix (word)) { - g_free (word); - return FALSE; - } - - g_free (word); - return TRUE; -} - -static void -e_name_western_reorder_asshole (ENameWestern *name, ENameWesternIdxs *idxs) -{ - char *prefix; - char *last; - char *suffix; - char *firstmidnick; - char *newfull; - - char *comma; - char *p; - - if (! e_name_western_detect_backwards (name, idxs)) - return; - - /* - * Convert - * <Prefix> <Last name>, <First name> <Middle[+nick] name> <Suffix> - * to - * <Prefix> <First name> <Middle[+nick] name> <Last name> <Suffix> - */ - - /* - * Grab the prefix from the beginning. - */ - prefix = e_name_western_get_prefix_at_str (name->full); - - /* - * Everything from the end of the prefix to the comma is the - * last name. - */ - comma = strchr (name->full, ','); - if (comma == NULL) - return; - - p = name->full + (prefix == NULL ? 0 : strlen (prefix)); - - while (isspace (*p) && *p != '\0') - p ++; - - last = g_malloc0 (comma - p + 1); - strncpy (last, p, comma - p); - - /* - * Get the suffix off the end. - */ - suffix = e_name_western_get_suffix_at_str_end (name->full); - - /* - * Firstmidnick is everything from the comma to the beginning - * of the suffix. - */ - p = comma + 1; - - while (isspace (*p) && *p != '\0') - p ++; - - if (suffix != NULL) { - char *q; - - /* - * Point q at the beginning of the suffix. - */ - q = name->full + strlen (name->full) - strlen (suffix) - 1; - - /* - * Walk backwards until we hit the space which - * separates the suffix from firstmidnick. - */ - while (! isspace (*q) && q > comma) - q --; - - if ((q - p + 1) > 0) { - firstmidnick = g_malloc0 (q - p + 1); - strncpy (firstmidnick, p, q - p); - } else - firstmidnick = NULL; - } else { - firstmidnick = g_strdup (p); - } - - /* - * Create our new reordered version of the name. - */ -#define NULLSTR(a) ((a) == NULL ? "" : (a)) - newfull = g_strdup_printf ("%s %s %s %s", NULLSTR (prefix), NULLSTR (firstmidnick), - NULLSTR (last), NULLSTR (suffix)); - g_strstrip (newfull); - g_free (name->full); - name->full = newfull; - - - g_free (prefix); - g_free (firstmidnick); - g_free (last); - g_free (suffix); -} - -static void -e_name_western_zap_nil (char **str, int *idx) -{ - if (*str == NULL) - return; - - if (strlen (*str) != 0) - return; - - *idx = -1; - g_free (*str); - *str = NULL; -} - -#define FINISH_CHECK_MIDDLE_NAME_FOR_CONJUNCTION \ - char *last_start = NULL; \ - if (name->last) \ - last_start = strchr (name->last, ' '); \ - if (last_start) { \ - char *new_last, *new_first; \ - \ - new_last = g_strdup (last_start + 1); \ - *last_start = '\0'; \ - \ - idxs->last_idx += (last_start - name->last) + 1; \ - \ - new_first = g_strdup_printf ("%s %s %s", name->first, name->middle, name->last); \ - \ - g_free (name->first); \ - g_free (name->middle); \ - g_free (name->last); \ - \ - name->first = new_first; \ - name->middle = NULL; \ - name->last = new_last; \ - \ - idxs->middle_idx = -1; \ - } else { \ - char *new_first; \ - \ - new_first = g_strdup_printf ("%s %s %s", name->first, name->middle, name->last); \ - \ - g_free (name->first); \ - g_free (name->middle); \ - g_free (name->last); \ - \ - name->first = new_first; \ - name->middle = NULL; \ - name->last = NULL; \ - idxs->middle_idx = -1; \ - idxs->last_idx = -1; \ - } - -#define CHECK_MIDDLE_NAME_FOR_CONJUNCTION(conj) \ - if (idxs->middle_idx != -1 && !strcmp (name->middle, conj)) { \ - FINISH_CHECK_MIDDLE_NAME_FOR_CONJUNCTION \ - } - -#define CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE(conj) \ - if (idxs->middle_idx != -1 && !strcasecmp (name->middle, conj)) { \ - FINISH_CHECK_MIDDLE_NAME_FOR_CONJUNCTION \ - } - -static void -e_name_western_fixup (ENameWestern *name, ENameWesternIdxs *idxs) -{ - /* - * The middle and last names cannot be the same. - */ - if (idxs->middle_idx != -1 && idxs->middle_idx == idxs->last_idx) { - idxs->middle_idx = -1; - g_free (name->middle); - name->middle = NULL; - } - - /* - * If we have a middle name and no last name, then we mistook - * the last name for the middle name. - */ - if (idxs->last_idx == -1 && idxs->middle_idx != -1) { - idxs->last_idx = idxs->middle_idx; - name->last = name->middle; - name->middle = NULL; - idxs->middle_idx = -1; - } - - /* - * Check to see if we accidentally included the suffix in the - * last name. - */ - if (idxs->suffix_idx != -1 && idxs->last_idx != -1 && - idxs->suffix_idx < (idxs->last_idx + strlen (name->last))) { - char *sfx; - - sfx = name->last + (idxs->suffix_idx - idxs->last_idx); - if (sfx != NULL) { - char *newlast; - char *p; - - p = sfx - 1; - while (isspace (*p) && p > name->last) - p --; - p ++; - - newlast = g_malloc0 (p - name->last + 1); - strncpy (newlast, name->last, p - name->last); - g_free (name->last); - name->last = newlast; - } - } - - /* - * If we have a prefix and a first name, but no last name, - * then we need to assign the first name to the last name. - * This way we get things like "Mr Friedman" correctly. - */ - if (idxs->first_idx != -1 && idxs->prefix_idx != -1 && - idxs->last_idx == -1) { - name->last = name->first; - idxs->last_idx = idxs->first_idx; - idxs->first_idx = -1; - name->first = NULL; - } - - if (idxs->middle_idx != -1) { - CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("&"); - CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("*"); - CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("|"); - CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("^"); - CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("&&"); - CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("||"); - CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("+"); - CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("-"); - CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("and"); - CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("or"); - CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("plus"); - - /* Spanish */ - CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("y"); - - /* German */ - CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("und"); - - /* Italian */ - CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("e"); - - /* Czech */ - CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("a"); - - /* Finnish */ - CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("ja"); - - /* French */ - CHECK_MIDDLE_NAME_FOR_CONJUNCTION_CASE ("et"); - - /* Russian */ - CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("\xd0\x98"); /* u+0418 */ - CHECK_MIDDLE_NAME_FOR_CONJUNCTION ("\xd0\xb8"); /* u+0438 */ - } - - /* - * Remove stray spaces and commas (although there don't seem - * to be any in the test cases, they might show up later). - */ - e_name_western_cleanup_string (& name->prefix); - e_name_western_cleanup_string (& name->first); - e_name_western_cleanup_string (& name->middle); - e_name_western_cleanup_string (& name->nick); - e_name_western_cleanup_string (& name->last); - e_name_western_cleanup_string (& name->suffix); - - /* - * Make zero-length strings just NULL. - */ - e_name_western_zap_nil (& name->prefix, & idxs->prefix_idx); - e_name_western_zap_nil (& name->first, & idxs->first_idx); - e_name_western_zap_nil (& name->middle, & idxs->middle_idx); - e_name_western_zap_nil (& name->nick, & idxs->nick_idx); - e_name_western_zap_nil (& name->last, & idxs->last_idx); - e_name_western_zap_nil (& name->suffix, & idxs->suffix_idx); -} - -/** - * e_name_western_western_parse_fullname: - * @full_name: A string containing a Western name. - * - * Parses @full_name and returns an #ENameWestern object filled with - * the component parts of the name. - */ -ENameWestern * -e_name_western_parse (const char *full_name) -{ - ENameWesternIdxs *idxs; - ENameWestern *wname; - - wname = g_new0 (ENameWestern, 1); - - wname->full = g_strdup (full_name); - - idxs = g_new0 (ENameWesternIdxs, 1); - - idxs->prefix_idx = -1; - idxs->first_idx = -1; - idxs->middle_idx = -1; - idxs->nick_idx = -1; - idxs->last_idx = -1; - idxs->suffix_idx = -1; - - /* - * An extremely simple algorithm. - * - * The goal here is to get it right 95% of the time for - * Western names. - * - * First we check to see if this is an ass-backwards name - * ("Prefix Last, First Middle Suffix"). These names really - * suck (imagine "Dr von Johnson, Albert Roderick Jr"), so - * we reorder them first and then parse them. - * - * Next, we grab the most obvious assignments for the various - * parts of the name. Once this is done, we check for stupid - * errors and fix them up. - */ - e_name_western_reorder_asshole (wname, idxs); - - e_name_western_extract_prefix (wname, idxs); - e_name_western_extract_first (wname, idxs); - e_name_western_extract_nickname (wname, idxs); - e_name_western_extract_middle (wname, idxs); - e_name_western_extract_last (wname, idxs); - e_name_western_extract_suffix (wname, idxs); - - e_name_western_fixup (wname, idxs); - - g_free (idxs); - - return wname; -} - -/** - * e_name_western_free: - * @name: An ENameWestern object which needs to be freed. - * - * Deep-frees @name - */ -void -e_name_western_free (ENameWestern *w) -{ - - g_free (w->prefix); - g_free (w->first); - g_free (w->middle); - g_free (w->nick); - g_free (w->last); - g_free (w->suffix); - - g_free (w->full); - - g_free (w); -} |