Re: UTF-8

Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]
From: Alexander Polakov
Subject: Re: UTF-8
Date: Saturday, August 7, 2010 - 7:52 am

* Jordi Beltran Creix <jbcreix.mail@gmail.com> [100804 17:26]:

I made a patch for ls based on NetBSD's code (see below); it works just fine for me.


One can use mksh instead (or, better, backport its UTF-8 handling code).


wcwidth() is in, but there is no man page yet.


--- ls.c	2010/08/07 15:15:04	1.1
+++ ls.c	2010/08/07 15:17:32
@@ -41,6 +41,7 @@
 #include <err.h>
 #include <errno.h>
 #include <fts.h>
+#include <locale.h>
 #include <grp.h>
 #include <pwd.h>
 #include <stdio.h>
@@ -102,6 +103,7 @@
 	int kflag = 0;
 	char *p;

+	setlocale(LC_CTYPE, "");
 	/* Terminal defaults to -Cq, non-terminal defaults to -1. */
 	if (isatty(STDOUT_FILENO)) {
 		if ((p = getenv("COLUMNS")) != NULL)
--- util.c	2010/08/07 15:00:48	1.1
+++ util.c	2010/08/07 15:13:52
@@ -41,18 +41,75 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <wchar.h>

 #include "ls.h"
 #include "extern.h"

+#define MB_LEN_MAX 32 /* goes into limits.h */
+
+static int
+printwc(wchar_t wc, mbstate_t * pst)
+{
+	size_t		size;
+	char		buf[MB_LEN_MAX];
+
+	size = wcrtomb(buf, wc, pst);
+	if (size == (size_t) -1)       /* This shouldn't happen, but for
+					 * sure */
+		return 0;
+	if (wc == L'\0') {
+		/* The following condition must be always true, but for sure */
+		if (size > 0 && buf[size - 1] == '\0')
+			--size;
+	}
+	if (size > 0)
+		fwrite(buf, 1, size, stdout);
+	return wc == L'\0' ? 0 : wcwidth(wc);
+}
+
 int
-putname(char *name)
+putname(char *src)
 {
-	int len;
+	int             n = 0;
+	mbstate_t       src_state, stdout_state;
+	/* The following +1 is to pass '\0' at the end of src to mbrtowc(). */
+	const char     *endptr = src + strlen(src) + 1;

-	for (len = 0; *name; len++, name++)
-		putchar((!isprint(*name) && f_nonprint) ? '?' : *name);
-	return len;
+	/*
+	* We have to reset src_state each time in this function, because
+	* the codeset of src pathname may not match with current locale.
+	* Note that if we pass NULL instead of src_state to mbrtowc(),
+	* there is no way to reset the state.
+	*/
+	memset(&src_state, 0, sizeof(src_state));
+	memset(&stdout_state, 0, sizeof(stdout_state));
+	while (src < endptr) {
+		wchar_t         wc;
+		size_t          rv, span = endptr - src;
+		rv = mbrtowc(&wc, src, span, &src_state);
+		if (rv == 0) {  /* assert(wc == L'\0'); */
+			/* The following may output a shift sequence. */
+			n += printwc(wc, &stdout_state);
+			break;
+		}
+		if (rv == (size_t) -1) {       /* probably errno == EILSEQ */
+			n += printwc(L'?', &stdout_state);
+			/* try to skip 1byte, because there is no better way */
+			src++;
+			memset(&src_state, 0, sizeof(src_state));
+		} else if (rv == (size_t) - 2) {
+			if (span < MB_CUR_MAX) {        /* incomplete char */
+				n += printwc(L'?', &stdout_state);
+				break;
+			}
+			src += span;    /* a redundant shift sequence? */
+		} else {
+			n += printwc(iswprint(wc) ? wc : L'?', &stdout_state);
+			src += rv;
+		}
+	}
+	return n;
 }

 void
Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]

Messages in current thread:
UTF-8 (was: Re: CVS: cvs.openbsd.org: src), Christian Weisgerber, (Wed Jul 28, 7:58 am)
Re: UTF-8 (was: Re: CVS: cvs.openbsd.org: src), Christian Weisgerber, (Wed Jul 28, 12:45 pm)
Re: UTF-8, Jordi Beltran Creix, (Wed Aug 4, 6:22 am)
Re: UTF-8, Matthew Dempsky, (Wed Aug 4, 1:36 pm)
Re: UTF-8, Jordi Beltran Creix, (Wed Aug 4, 6:20 pm)
Re: UTF-8, Matthew Szudzik, (Thu Aug 5, 5:49 am)
Re: UTF-8, Philip Guenther, (Thu Aug 5, 11:33 am)
Re: UTF-8, Matthew Szudzik, (Thu Aug 5, 12:50 pm)
Re: UTF-8, Kevin Chadwick, (Thu Aug 5, 3:05 pm)
Re: UTF-8, Philip Guenther, (Thu Aug 5, 7:40 pm)
Re: UTF-8, Marco Peereboom, (Thu Aug 5, 8:06 pm)
Re: UTF-8, Dmitrij D. Czarkoff, (Thu Aug 5, 10:28 pm)
Re: UTF-8, Marc Espie, (Fri Aug 6, 4:31 am)
Re: UTF-8, STeve Andre', (Fri Aug 6, 6:18 am)
Re: UTF-8, Kevin Chadwick, (Fri Aug 6, 6:52 am)
Re: UTF-8, Alexander Polakov, (Sat Aug 7, 7:52 am)
Re: UTF-8, Ingo Schwarze, (Sat Aug 7, 9:47 am)