I wrote the attached patch for the pico editor of pine-4.61. It is solely my work, based on some of pine's existing code in this area. No line from any foreign source is included. This is a submission of the attached patch under the condition outlined in the pine legal notice: http://www.washington.edu/pine/overview/legal.html "Submission of these patches to University of Washington for possible inclusion in future Pine versions is also encouraged, with the understanding that they would be treated the same as all other Pine code in terms of licensing." I therefore encourage the University of Washington to look at the patch and consider inclusion in future Pine versions as outlined above under the licensing terms of pine. This patch does not remove code or features, and it only depends on features provided by the Single UNIX Specification, Version 2. This reference to the Single UNIX Specification only applies to the detection of a UTF-8 locale, and for operating systems which don't support it this can be moved to OS-specific files. The only lines of code in this patch which are not my original code are the UTF-8 decoding and encoding routines, which are reused versions of the same decoding and encoding routines which already existed in other parts of pine 4.61. Run through the patch: * Extend the character storage in struct CELL to 32-bit for UCS-4 * Extend the kill and justify buffers the same way * Fix places where buffer data was unnecessarily narrowed to 8-bit * Replace the obsolete bit "P_LOCALLF" and use it as "P_UNICODE" for selecting whether to store 8-bit characters or 32-bit UCS-4 data in struct CELL, the storage container pico uses for characters. 
* Change the default of setlocale_ctype from 0 to 1 (default on) (add the option -no_setlocale_ctype to disable setlocale) * Call nl_langinfo(CODESET) to check for a UTF-8 charset (this is only possible and done when setlocale(LC_CTYPE, "") is enabled) * Create a wrapper for linsert which collects UTF-8 bytes on each call and, if it finds a valid UTF-8 character, converts it to UCS-4 and passes it to linsert for storing the incoming data as one UCS-4 code in a CELL * Change the places where data is received from outside sources to use the linsert wrapper instead of linsert. * Extend ttputc() to check for UCS-4 codes and output the corresponding UTF-8 sequence to the terminal. * Extend ffputline to do the same for file output. * Extend the search function to convert the search pattern to UCS-4 and apply the search by matching the UCS-4 codes if pico is in Unicode mode * In forwsearch, use the easiest method to find successive occurrences of the pattern because the complicated method does not work in Unicode (UTF-8/UCS-4) * Extend the byte-oriented pico buffer read/write interface, used when pico is the internal editor of pine, to decode UTF-8 to UCS-4 cells on write and encode UCS-4 cells to UTF-8 on read. This allows pine 4.61 to use libpico as internal editor on terminals using UTF-8 as charset. * To make this work, the patch extends pine to also look for a locale setting which uses UTF-8 as character-set and to pass the information to activate Unicode mode to libpico. This set of changes, used on Linux glibc-2.2 (or higher), results in a functional pico (stand-alone and inside pine) which, depending on the current locale, uses UTF-8 as charset (for the use in pine: or if UTF-8 is configured as character-set) for reads and writes to/from files and folders (compose and postpone of messages) as well as the terminal. 
Bernhard Kaindl diff -pruN -x '*~' -x '*.rej' -x '*.orig' pine4.63-patch11/pico/estruct.h pine4.63/pico/estruct.h --- pine4.63-patch11/pico/estruct.h 2004-11-30 17:39:55.000000000 -0700 +++ pine4.63/pico/estruct.h 2005-06-01 07:55:36.000000000 -0600 @@ -290,7 +290,7 @@ typedef struct { * and short if there are problems... */ typedef struct CELL { - unsigned int c : 8; /* Character value in cell */ + unsigned int c : 32; /* Character value in cell */ unsigned int a : 8; /* Its attributes */ } CELL; diff -pruN -x '*~' -x '*.rej' -x '*.orig' pine4.63-patch11/pico/file.c pine4.63/pico/file.c --- pine4.63-patch11/pico/file.c 2004-06-11 15:49:48.000000000 -0600 +++ pine4.63/pico/file.c 2005-06-01 07:55:36.000000000 -0600 @@ -425,11 +425,11 @@ insmsgchar(c) for(p = (glo_quote_str ? glo_quote_str : (Pmaster ? Pmaster->quote_str : NULL)); p && *p; p++) - if(!linsert(1, *p)) + if(!linsert_byte(1, *p)) return(0); } else if(c != '\r') /* ignore CR (likely CR of CRLF) */ - return(linsert(1, c)); + return(linsert_byte(1, c)); return(1); } @@ -493,7 +493,7 @@ int rename; /* don't rename case FIOLNG : for(linep = line; charsread-- > 0; linep++) - linsert(1, (unsigned char) *linep); + linsert_byte(1, *linep); break; @@ -910,7 +910,7 @@ char fname[]; case FIOLNG : for(linep = line; charsread-- > 0; linep++) - linsert(1, (unsigned char) *linep); + linsert_byte(1, *linep); break; diff -pruN -x '*~' -x '*.rej' -x '*.orig' pine4.63-patch11/pico/fileio.c pine4.63/pico/fileio.c --- pine4.63-patch11/pico/fileio.c 2002-02-12 15:54:21.000000000 -0700 +++ pine4.63/pico/fileio.c 2005-06-01 07:55:36.000000000 -0600 @@ -71,9 +71,20 @@ ffputline(buf, nbuf) { register int i; - for (i = 0; i < nbuf; ++i) + for (i = 0; i < nbuf; ++i) { + if (gmode & P_UNICODE && buf[i].c & 0xffffff80) { + if (buf[i].c & 0xf800) { + fputc(0xe0 | (buf[i].c >> 12), g_pico_fio.fp); + fputc(0x80 | ((buf[i].c >> 6) & 0x3f), g_pico_fio.fp); + } + else + fputc(0xc0 | ((buf[i].c >> 6) & 0x3f), g_pico_fio.fp); + if 
(fputc(0x80 | (buf[i].c & 0x3f), g_pico_fio.fp) == EOF) + break; + } else if(fputc(buf[i].c&0xFF, g_pico_fio.fp) == EOF) break; + } if(i == nbuf) fputc('\n', g_pico_fio.fp); diff -pruN -x '*~' -x '*.rej' -x '*.orig' pine4.63-patch11/pico/line.c pine4.63/pico/line.c --- pine4.63-patch11/pico/line.c 2004-05-07 15:41:16.000000000 -0600 +++ pine4.63/pico/line.c 2005-06-01 07:55:36.000000000 -0600 @@ -51,7 +51,7 @@ static char rcsid[] = "$Id: line.c,v 4.3 */ struct pkchunk { short used; /* # of bytes used in this buffer*/ - char bufp[KBLOCK]; /* buffer containing text */ + int bufp[KBLOCK]; /* buffer containing text */ struct pkchunk *next; /* pointer to next chunk */ }; @@ -210,6 +210,83 @@ int f, n; backchar(f, n); } +/* Return UCS-4 character from UTF-8 string + * (Based on code from pine-4.61/imap/src/c-client/utf8.c) + * Accepts: pointer to string, remaining octets in string + * Returns: UCS-4 character or negative if error + */ +unsigned int utf8_get_ucs_string(unsigned char **s, unsigned int i) +{ + unsigned char c; + unsigned int ret = 0; + int more = 0; + while (i--) { + if (((c = *(*s)++) > 0x7f) && (c < 0xc0)) { /* UTF-8 continuation? */ + if (!more) /* continuation outside of UTF-8 sequence? */ + return '?'; /* bad sequence, put replacement character */ + ret <<= 6; + ret |= c & 0x3f; + if (!--more) /* last octet reached? 
*/ + return ret; /* return UTC-4 code */ + } + else if (more) /* in sequence, but not a continuation byte */ + return '?'; /* bad sequence, put replacement character */ + else if (c < 0x80) /* U+0000 - U+007f */ + return c; + else if (c < 0xe0) { /* U+0080 - U+07ff */ + ret = c & 0x1f; /* first 5 bits of 12 */ + more = 1; + } + else if (c < 0xf0) { /* U+1000 - U+ffff */ + ret = c & 0x0f; /* first 4 bits of 16 */ + more = 2; + } /* non-BMP Unicode */ + else if (c < 0xf8) { /* U+10000 - U+10ffff (U+1fffff) */ + ret = c & 0x07; /* first 3 bits of 20.5 (21) */ + more = 3; + } + else if (c < 0xfc) { /* ISO 10646 200000 - 3fffffff */ + ret = c & 0x03; /* first 2 bits of 26 */ + more = 4; + } + else if (c < 0xfe) { /* ISO 10646 4000000-7fffffff */ + ret = c & 0x01; /* first bit of 31 */ + more = 5; + } else + return '?'; /* not in ISO 10646 -> replacement character */ + } /* end of input, but sequnece not complete */ + return 0; +} +unsigned int utf8_get_ucs(unsigned char *s, unsigned int i) +{ + unsigned char *l = s; + return utf8_get_ucs_string(&l, i); +} + +/* + * Insert "n" copies of the character "c" at the current location of dot. + * The real work is done by linsert(). This is a wrapper does: + * In UTF-8 mode, decode byte sequencies and if a sequence is complete, + * insert the resulting Unicode(UCS4) value as cell value into the buffer. + */ +int linsert_byte(n, c) +unsigned int n, c; +{ + static char linsert_buf[6], linsert_buf_count = 0; + if (n == 1 && gmode & P_UNICODE && c & 0x80) { + if (linsert_buf_count >= sizeof(linsert_buf)) + linsert_buf_count = 0; + linsert_buf[linsert_buf_count++] = c; + c = 0; + if (linsert_buf_count > 1) + c = utf8_get_ucs(linsert_buf, linsert_buf_count); + if (!c) + return 1; + } + linsert_buf_count = 0; + return linsert(n, c); +} + /* * Insert "n" copies of the character "c" at the current location of dot. In * the easy case all that happens is the text is stored in the line. 
In the @@ -294,7 +371,7 @@ long *lines; lp1->l_bp = lp2; *doto = n; *dotp = lp1; - ac.c = ((char)c & 0xff); + ac.c = c; cp1 = &(*dotp)->l_text[0]; while(n--) *cp1++ = ac; @@ -342,7 +419,7 @@ long *lines; *--cp2 = *--cp1; } - ac.c = ((char)c & 0xff); + ac.c = c; while(n--) /* add the chars */ (*dotp)->l_text[(*doto)++] = ac; @@ -767,7 +844,7 @@ pkbufremove(n, buf) if(!(p = p->next)) return(-1); - return(p->bufp[n % KBLOCK] & 0xff); + return(p->bufp[n % KBLOCK]); } else return(-1); diff -pruN -x '*~' -x '*.rej' -x '*.orig' pine4.63-patch11/pico/main.c pine4.63/pico/main.c --- pine4.63-patch11/pico/main.c 2004-05-07 15:41:16.000000000 -0600 +++ pine4.63/pico/main.c 2005-06-01 07:55:36.000000000 -0600 @@ -119,7 +119,6 @@ char *args_pico_args[] = { #endif "\t +[line#] \tLine - start on line# line, default=1", "\t -v \t\tView - view file", -"\t -setlocale_ctype\tdo setlocale(LC_CTYPE) if available", "\t -no_setlocale_collate\tdo not do setlocale(LC_COLLATE)", "\t -version\tPico version number", "", @@ -146,7 +145,7 @@ char *argv[]; int viewflag = FALSE; /* are we starting in view mode?*/ int starton = 0; /* where's dot to begin with? 
*/ int setlocale_collate = 1; - int setlocale_ctype = 0; + int setlocale_ctype = 1; /* if a problem shows up, should be fixed */ char bname[NBUFN]; /* buffer name of file to read */ char *file_to_edit = NULL; diff -pruN -x '*~' -x '*.rej' -x '*.orig' pine4.63-patch11/pico/osdep/unix pine4.63/pico/osdep/unix --- pine4.63-patch11/pico/osdep/unix 2005-04-19 15:28:56.000000000 -0600 +++ pine4.63/pico/osdep/unix 2005-06-01 07:55:37.000000000 -0600 @@ -1,3 +1,7 @@ +#ifdef LC_CTYPE +#include +#endif + int timeo = 0; time_t time_of_last_input; int (*pcollator)(); @@ -221,6 +225,15 @@ ttgetwinsz(row, col) */ ttputc(c) { + if (gmode & P_UNICODE && c > 127) { + if (c & 0xf800) { + putc(0xe0 | (c >> 12), stdout); + putc(0x80 | ((c >> 6) & 0x3f), stdout); + } + else + putc(0xc0 | ((c >> 6) & 0x3f), stdout); + return putc(0x80 | (c & 0x3f), stdout); + } return(putc(c, stdout)); } @@ -3871,6 +3884,9 @@ set_collation(collation, ctype) #ifdef LC_CTYPE if(ctype){ (void)setlocale(LC_CTYPE, ""); + /* For reference, see: "The Single UNIX(R) Specification, Version 2" */ + if (!strcmp(nl_langinfo(CODESET), "UTF-8")) + gmode ^= P_UNICODE; } #endif } diff -pruN -x '*~' -x '*.rej' -x '*.orig' pine4.63-patch11/pico/pico.c pine4.63/pico/pico.c --- pine4.63-patch11/pico/pico.c 2005-03-31 10:08:57.000000000 -0700 +++ pine4.63/pico/pico.c 2005-06-01 07:55:37.000000000 -0600 @@ -540,7 +540,7 @@ int c, f, n; /* do the appropriate insertion */ /* pico never does C mode, this is simple */ - status = linsert(n, c); + status = linsert_byte(n, c); /* * Check to make sure we didn't go off of the screen @@ -1451,6 +1451,7 @@ typedef struct picotext { LINE *dotp; int doto; short crinread; + char readch[6]; } PICOTEXT; #define PT(X) ((PICOTEXT *)(X)) @@ -1504,6 +1505,8 @@ void *w; * pico_readc - return char at current point. Up to calling routines * to keep cumulative count of chars. 
*/ +#define PUTC(w, c) PT(w)->readch[PT(w)->crinread++] = c; +#define GETC(w) PT(w)->readch[--PT(w)->crinread]; int pico_readc(w, c) void *w; @@ -1512,12 +1515,20 @@ unsigned char *c; int rv = 0; if(PT(w)->crinread){ - *c = '\012'; /* return LF */ - PT(w)->crinread = 0; + *c = GETC(w); rv++; } else if(PT(w)->doto < llength(PT(w)->dotp)){ /* normal char to return */ - *c = (unsigned char) lgetc(PT(w)->dotp, (PT(w)->doto)++).c; + int ch = lgetc(PT(w)->dotp, (PT(w)->doto)++).c; + if (gmode & P_UNICODE && ch & 0xff80) { + PUTC(w, 0x80 | (ch & 0x3f)) + if (ch & 0xf800) { /* three byte code */ + *c = 0xe0 | (ch >> 12); + PUTC(w, 0x80 | ((ch >> 6) & 0x3f)) + } else + *c = 0xc0 | ((ch >> 6) & 0x3f); + } else + *c = ch; rv++; } else if(PT(w)->dotp != PT(w)->linep){ /* return line break */ @@ -1525,7 +1536,7 @@ unsigned char *c; PT(w)->doto = 0; #if defined(DOS) || defined(OS2) *c = '\015'; - PT(w)->crinread++; + PUTC(w, '\012') #else *c = '\012'; /* return local eol! */ #endif @@ -1584,8 +1595,20 @@ int c; rv++; } - else + else { + if (gmode & P_UNICODE && c & 0xffffff80) { + if (PT(w)->crinread >= 6) + PT(w)->crinread = 0; + PUTC(w, c); + c = 0; + if (PT(w)->crinread > 1) + c = utf8_get_ucs(PT(w)->readch, PT(w)->crinread); + if (!c) + return 1; + } + PT(w)->crinread = 0; rv = geninsert(&PT(w)->dotp, &PT(w)->doto, PT(w)->linep, c, 0, 1, NULL); + } return((rv) ? 1 : 0); /* return number written */ } diff -pruN -x '*~' -x '*.rej' -x '*.orig' pine4.63-patch11/pico/pico.h pine4.63/pico/pico.h --- pine4.63-patch11/pico/pico.h 2005-03-30 15:44:42.000000000 -0700 +++ pine4.63/pico/pico.h 2005-06-01 07:55:37.000000000 -0600 @@ -356,7 +356,7 @@ extern void kbdestroy PROTO((KBESC_T *)) #define P_HICTRL 0x80000000 /* overwrite mode */ #define P_CHKPTNOW 0x40000000 /* do the checkpoint on entry */ #define P_DELRUBS 0x20000000 /* map ^H to forwdel */ -#define P_LOCALLF 0x10000000 /* use local vs. 
NVT EOL */ +#define P_UNICODE 0x10000000 /* run in Unicode mode */ #define P_BODY 0x08000000 /* start composer in body */ #define P_HEADEND 0x04000000 /* start composer at end of header */ #define P_VIEW MDVIEW /* read-only */ diff -pruN -x '*~' -x '*.rej' -x '*.orig' pine4.63-patch11/pico/search.c pine4.63/pico/search.c --- pine4.63-patch11/pico/search.c 2004-07-01 15:36:36.000000000 -0600 +++ pine4.63/pico/search.c 2005-06-01 07:55:37.000000000 -0600 @@ -113,6 +113,8 @@ int pc; forwsearch(f, n) int f, n; { + LINE *lastline; /* line position before scan */ + int lastoff; /* last position within line */ register int status; int wrapt = FALSE, wrapt2 = FALSE; int repl_mode = FALSE; @@ -259,26 +261,15 @@ forwsearch(f, n) } } + lastline = curwp->w_dotp; /* line position before scan */ + lastoff = curwp->w_doto; /* last position within line */ + /* - * This code is kind of dumb. What I want is successive C-W 's to - * move dot to successive occurences of the pattern. So, if dot is - * already sitting at the beginning of the pattern, then we'll move - * forward a char before beginning the search. We'll let the - * automatic wrapping handle putting the dot back in the right - * place... + * Successive C-W 's should move the dot to successive occurences + * of the pattern. So move the dot forward one char before the search + * and if the seach fails, put it back were it was. */ - status = 0; /* using "status" as int temporarily! */ - while(1){ - if(defpat[status] == '\0'){ - forwchar(0, 1); - break; /* find next occurence! */ - } - - if(status + curwp->w_doto >= llength(curwp->w_dotp) || - !eq((unsigned char)defpat[status],lgetc(curwp->w_dotp, curwp->w_doto + status).c)) - break; /* do nothing! 
*/ - status++; - } + forwchar(0, 1); /* search for the pattern */ @@ -290,6 +281,8 @@ forwsearch(f, n) /* and complain if not there */ if (status == FALSE){ emlwrite("\"%s\" not found", defpat); + curwp->w_dotp = lastline; /* line position before scan */ + curwp->w_doto = lastoff; /* last position within line */ } else if((gmode & MDREPLACE) && repl_mode == TRUE){ status = replace_pat(defpat, &wrapt2); /* replace pattern */ @@ -705,9 +698,19 @@ int leavep; /* place to leave point register int c; /* character at current position */ register LINE *matchline; /* current line during matching */ register int matchoff; /* position in matching line */ - register char *patptr; /* pointer into pattern */ + register char *patptr = patrn; /* pointer into pattern */ + unsigned char *tmp; register int stopoff; /* offset to stop search */ register LINE *stopline; /* line to stop search */ + unsigned int ucspat[NPAT], ucspos = 0, match; + + /* In Unicode mode, we've to create an UCS-4 vector from the pattern: */ + while (gmode & P_UNICODE && *patptr != 0 && ucspos < NPAT) { + tmp = patptr; + ucspat[ucspos++] = utf8_get_ucs_string(&tmp, strlen(patptr)); + patptr = tmp; + } + ucspat[ucspos] = 0; /* terminate the int vector with a zero int */ *wrapt = FALSE; @@ -759,15 +762,23 @@ int leavep; /* place to leave point else c = lgetc(curline, curoff++).c; /* get the char */ + if (gmode & P_UNICODE) { + match = ucspat[ucspos=0]; + } else + match = patrn[0]; + /* test it against first char in pattern */ - if (eq(c, (unsigned char)patrn[0]) != FALSE) { /* if we find it..*/ + if (eq(c, match) != FALSE) { /* if we find it..*/ /* setup match pointers */ matchline = curline; matchoff = curoff; patptr = &patrn[0]; /* scan through patrn for a match */ - while (*++patptr != 0) { + while (1) { + if (!(match = *++patptr) || (gmode & P_UNICODE && + !(match = ucspat[++ucspos]))) + break; /* advance all the pointers */ if (matchoff == llength(matchline)) { /* advance past EOL */ @@ -781,7 +792,7 @@ 
int leavep; /* place to leave point return(FALSE); /* and test it against the pattern */ - if (eq((unsigned char) *patptr, c) == FALSE) + if (eq(match, c) == FALSE) goto fail; } diff -pruN -x '*~' -x '*.rej' -x '*.orig' pine4.63-patch11/pine/init.c pine4.63/pine/init.c --- pine4.63-patch11/pine/init.c 2005-04-07 12:01:42.000000000 -0600 +++ pine4.63/pine/init.c 2005-06-01 07:55:37.000000000 -0600 @@ -66,6 +66,9 @@ static char rcsid[] = "$Id: init.c,v 4.7 #include "headers.h" #include "../c-client/imap4r1.h" /* for LEVELSTATUS() */ +#ifdef LC_CTYPE +# include +#endif typedef enum {Sapling, Seedling, Seasoned} FeatureLevel; @@ -1484,6 +1487,15 @@ init_vars(ps) /*--- The defaults here are defined in os-xxx.h so they can vary per machine ---*/ +#ifdef LC_CTYPE + setlocale(LC_CTYPE, ""); /* needed for using nl_langinfo */ + GLO_CHAR_SET = cpystr(nl_langinfo(CODESET)); + /* if codeset indicates that we are in an US-ASCII locale, */ + if (!strcmp(GLO_CHAR_SET, "ANSI_X3.4-1968")) { + fs_give((void **) &(GLO_CHAR_SET)); + cpystr("US-ASCII"); /* default to US-ASCII */ + } +#endif GLO_PRINTER = cpystr(DF_DEFAULT_PRINTER); GLO_ELM_STYLE_SAVE = cpystr(DF_ELM_STYLE_SAVE); @@ -2111,6 +2123,9 @@ init_vars(ps) set_current_val(&vars[V_OLD_STYLE_REPLY], TRUE, TRUE); obs_old_style_reply = !strucmp(VAR_OLD_STYLE_REPLY, "yes"); + /* needed in process_feature_list */ + set_current_val(&vars[V_CHAR_SET], TRUE, TRUE); + set_feature_list_current_val(&vars[V_FEATURE_LIST]); process_feature_list(ps, VAR_FEATURE_LIST, (obs_feature_level == Seasoned) ? 
1 : 0, @@ -3103,6 +3118,10 @@ process_feature_list(ps, list, old_growt if(old_growth) set_old_growth_bits(ps, 0); + if(ps_global->VAR_CHAR_SET + && !strucmp(ps_global->VAR_CHAR_SET, "UTF-8")) + F_TURN_ON(F_QUELL_CHARSET_WARNING, ps_global); /* if not user-off */ + /* now run through the list (global, user, and cmd_line lists are here) */ if(list){ for(p = list; (q = *p) != NULL; p++){ @@ -3154,6 +3173,11 @@ process_feature_list(ps, list, old_growt #ifdef _WINDOWS ps->pass_ctrl_chars = 1; #else + if(ps_global->VAR_CHAR_SET + && !strucmp(ps_global->VAR_CHAR_SET, "UTF-8")) { + F_TURN_ON(F_PASS_C1_CONTROL_CHARS, ps_global); /* global see below */ + F_TURN_ON(F_ENABLE_SETLOCALE_CTYPE, ps); /* for setting gmode */ + } ps->pass_ctrl_chars = F_ON(F_PASS_CONTROL_CHARS,ps_global) ? 1 : 0; ps->pass_c1_ctrl_chars = F_ON(F_PASS_C1_CONTROL_CHARS,ps_global) ? 1 : 0; diff -pruN -x '*~' -x '*.rej' -x '*.orig' pine4.63-patch11/pine/reply.c pine4.63/pine/reply.c --- pine4.63-patch11/pine/reply.c 2005-04-28 11:22:03.000000000 -0600 +++ pine4.63/pine/reply.c 2005-06-01 07:55:37.000000000 -0600 @@ -6356,6 +6356,9 @@ standard_picobuf_setup(pbf) && ps_global->VAR_EDITOR[0] && ps_global->VAR_EDITOR[0][0])) ? P_ADVANCED : 0L) + | ((ps_global->VAR_CHAR_SET + && !strucmp(ps_global->VAR_CHAR_SET, "UTF-8")) + ? P_UNICODE : 0L) | ((!ps_global->VAR_CHAR_SET || !strucmp(ps_global->VAR_CHAR_SET, "US-ASCII")) ? P_HIBITIGN : 0L));