Unicode characters not shown correctly

258 Views Asked by At

I am making a C program that supports many languages. The program sends emails using the type WCHAR instead of char. The problem is that when I receive the email and read it, some characters are not shown correctly, even some English ones like e, m, ... This is an example:

<!-- language: lang-c -->
/* Register payload_source as the callback libcurl calls to obtain upload data. */
curl_easy_setopt(hnd, CURLOPT_READFUNCTION, payload_source);
/* Pass &upload_ctx to that callback; it arrives there as the userp argument. */
curl_easy_setopt(hnd, CURLOPT_READDATA, &upload_ctx);

/*
 * Message to upload, one wide-character string per line: header lines, an
 * empty line, then the body. The NULL entry marks the end of the table and
 * is what payload_source checks for to stop.
 */
static const WCHAR *payload_text[]={
    L"To: <[email protected]>\n",
    L"From: <[email protected]>(Example User)\n",
    L"Subject: Hello!\n",
    L"\n",
    L"Message sent\n",
    NULL
};

/* Upload progress carried between successive payload_source calls. */
struct upload_status {
    int lines_read;   /* index into payload_text[] of the next line to send */
};

/*
 * Read callback: copies the next entry of payload_text[] into ptr and
 * advances upload_ctx->lines_read.
 *
 * ptr    destination buffer
 * size   size of one element, in bytes
 * nmemb  number of elements available in ptr
 * userp  pointer to the struct upload_status tracking progress
 *
 * Returns the value of len (see the review notes below), or 0 when the
 * buffer has no room or the NULL entry of payload_text[] is reached.
 */
static size_t payload_source(void *ptr, size_t size, size_t nmemb, void *userp){
    struct upload_status *upload_ctx = (struct upload_status *)userp;
    const WCHAR *data;

    /* Nothing can be written into a zero-sized buffer. */
    if ((size == 0) || (nmemb == 0) || ((size*nmemb) < 1)) {
        return 0;
    }

    data = payload_text[upload_ctx->lines_read];
    if (data) {
        /* wcslen() returns a count of wide characters, not bytes. */
        size_t len = wcslen(data);
        /* NOTE(review): memcpy() takes a byte count, so only len bytes are
         * copied here rather than len * sizeof(WCHAR); the wide string is
         * truncated. */
        /* NOTE(review): len is never compared against size * nmemb, so a
         * long line could be written past the end of ptr. */
        memcpy(ptr, data, len);
        upload_ctx->lines_read ++;
        return len;
    }
    /* NULL entry reached: no more lines to send. */
    return 0;
}
1

There is 1 best solution below

0
On

memcpy() operates on bytes, not on characters. You are not taking into account that sizeof(wchar_t) > 1. It is 2 bytes on some systems and 4 bytes on others. This discrepancy makes wchar_t a bad choice when writing portable code. You should be using a Unicode library instead (such as ICU or iconv).

You need to take sizeof(wchar_t) into account when calling memcpy(). You also need to take into account that the destination buffer may be smaller than the number of bytes of text you are trying to copy. Keeping track of lines_read by itself is not enough; you also have to keep track of how many bytes of the current line you have already copied, so that you can handle the case where the current line of text straddles multiple destination buffers.

Try something more like this instead:

/*
 * Read callback: streams the lines of payload_text[] into the upload buffer
 * as raw bytes.
 *
 * Copies as many bytes of the remaining lines as fit into the size * nmemb
 * bytes available at ptr. A line that does not fit completely is resumed on
 * the next call, so struct upload_status must record a byte offset into the
 * current line in addition to the line index:
 *
 *     struct upload_status {
 *         int    lines_read;
 *         size_t line_bytes_read;
 *     };
 *
 * Both members must start at 0.
 *
 * ptr    destination buffer
 * size   size of one element, in bytes
 * nmemb  number of elements available in ptr
 * userp  pointer to the struct upload_status tracking progress
 *
 * Returns the total number of bytes written to ptr during this call; this is
 * 0 once the NULL entry of payload_text[] has been reached.
 */
static size_t payload_source(void *ptr, size_t size, size_t nmemb, void *userp)
{
    struct upload_status *upload_ctx = (struct upload_status *) userp;
    unsigned char *buf = (unsigned char *) ptr;
    size_t available = size * nmemb;
    size_t total = 0;

    while (available > 0)
    {
        const wchar_t *data = payload_text[upload_ctx->lines_read];
        if (!data) break;

        /* wcslen() counts wide characters; memcpy() needs a byte count. */
        size_t line_bytes = wcslen(data) * sizeof(wchar_t);
        size_t remaining = line_bytes - upload_ctx->line_bytes_read;
        size_t bytes_to_copy = (remaining < available) ? remaining : available;

        /* Resume from where the previous call stopped inside this line. */
        const unsigned char *rawdata =
            (const unsigned char *) data + upload_ctx->line_bytes_read;

        memcpy(buf, rawdata, bytes_to_copy);

        buf += bytes_to_copy;
        available -= bytes_to_copy;
        total += bytes_to_copy;   /* accumulate across lines, do not overwrite */

        if (bytes_to_copy == remaining)
        {
            /* Line fully sent: move on to the next one. */
            upload_ctx->lines_read ++;
            upload_ctx->line_bytes_read = 0;
        }
        else
        {
            /* Buffer is full mid-line: remember the offset for the next call. */
            upload_ctx->line_bytes_read += bytes_to_copy;
        }
    }

    return total;
}