Algorithm to convert katakana to hiragana in UTF-8

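This is an example C program which gives an algorithm for converting katakana to hiragana in the UTF-8 encoding. The input is a UTF-8 string which may contain katakana mixed with other text; characters which are not katakana are copied unchanged. The output is returned in newly allocated memory, so the caller needs to release it with free after use.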

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

/* Make a copy of the UTF-8 string "katakana" in which each katakana
   character that has a hiragana equivalent is replaced by that
   hiragana. All other bytes are copied unchanged.

   Katakana U+30A1 to U+30F6 and the iteration marks U+30FD and U+30FE
   are converted. Each one lies 0x60 code points above its hiragana
   counterpart. The remaining characters of the katakana block (U+30A0,
   U+30F7 to U+30FC and U+30FF) are left as they are.

   On success the return value is 0 and "* hiragana_ptr" points to the
   newly allocated, nul-terminated result, which the caller must
   release with "free". If "katakana" or "hiragana_ptr" is NULL, or if
   memory cannot be allocated, the return value is -1 and
   "* hiragana_ptr" (when "hiragana_ptr" is not NULL) is set to NULL. */

int kata2hira (const unsigned char * katakana, unsigned char ** hiragana_ptr)
{
    unsigned char * h;
    unsigned char * hiragana;
    size_t size;

    if (! hiragana_ptr) {
        return -1;
    }
    * hiragana_ptr = NULL;
    if (! katakana) {
        return -1;
    }
    /* Copy the input with malloc and memcpy rather than strdup, since
       strdup is not part of ISO C before C23 and is not declared by
       <string.h> under strict -std=c99/-std=c11 builds. */
    size = strlen ((const char *) katakana) + 1;
    hiragana = malloc (size);
    if (! hiragana) {
        return -1;
    }
    memcpy (hiragana, katakana, size);
    h = hiragana;
    while (* h) {
        /* Look for a three-byte sequence E3 82 xx or E3 83 xx, where xx
           is a valid UTF-8 continuation byte. This covers U+3080 to
           U+30FF: the end of the hiragana block plus the whole of the
           katakana block. Requiring a continuation byte also excludes
           the terminating nul, and stops malformed input from having
           an unrelated byte altered or skipped. */
        if (h[0] == 0xe3 && (h[1] == 0x82 || h[1] == 0x83) &&
            h[2] >= 0x80 && h[2] <= 0xbf) {
            /* Check that this is within the range of katakana which
               can be converted into hiragana: E3 82 A1 to E3 83 B6,
               plus E3 83 BD and E3 83 BE. */
            if ((h[1] == 0x82 && h[2] >= 0xa1) ||
                (h[1] == 0x83 && h[2] <= 0xb6) ||
                (h[1] == 0x83 && (h[2] == 0xbd || h[2] == 0xbe))) {
                /* Subtract 0x60 from the code point by working on the
                   bytes directly. Each value of the second byte spans
                   0x40 code points, so the subtraction is either one
                   step down in the second byte and 0x20 down in the
                   third, or two steps down in the second byte and 0x20
                   up in the third. */
                if (h[2] >= 0xa0) {
                    h[1] = h[1] - 1;
                    h[2] -= 0x20;
                }
                else {
                    h[1] = h[1] - 2;
                    h[2] += 0x20;
                }
            }
            h += 3;
        }
        else {
            h++;
        }
    }
    * hiragana_ptr = hiragana;
    return 0;
}

/* Test driver: run "kata2hira" over a list of sample strings and print
   each input ("K:") followed by its converted form ("H:"). If a
   conversion reports failure, a message is printed to the standard
   error stream and the program exits with EXIT_FAILURE. */

int main (void)
{
    size_t i;
    const char * tests[] = {
        "サク:k] 3211 [ノミ:h] 453 [ウガツ:h] ",
        "钁: 886 [カク:k] 1210 [キャク:k] ",
        "鸚: 437 [イン:k] 644 [オウ:k] 4403 [ヨウ:k] ",
        "鸛: 1101 [カン:k] 1500 [コウノトリ:h] ",
        "鬱: 514 [ウツ:k] 512 [ウッスル:h] 3613 [フサグ:h] 1853 [シゲル:h] ",
        "爨: 1791 [サン:k] 922 [カシグ:h] 1035 [カマド:h] ",
        /* First row of unicode katakana. */
        "゠ァアィイゥウェエォオカガキギク",
        /* Last row of unicode katakana. */
        "ヰヱヲンヴヵヶヷヸヹヺ・ーヽヾヿ",
    };
    const size_t n_tests = sizeof (tests) / sizeof (tests[0]);
    for (i = 0; i < n_tests; i++) {
        /* Start from NULL so that the pointer is never indeterminate,
           even if the conversion fails without storing a result. */
        unsigned char * hiragana = NULL;
        if (kata2hira ((const unsigned char *) tests[i], & hiragana) != 0) {
            fprintf (stderr, "Conversion of test %lu failed.\n",
                     (unsigned long) i);
            return EXIT_FAILURE;
        }
        /* "%s" expects a pointer to char, so cast the byte string. */
        printf ("K: %s\nH: %s\n", tests[i], (const char *) hiragana);
        free (hiragana);
    }
    return 0;
}

(download)

The output of the example looks like this:

K: サク:k] 3211 [ノミ:h] 453 [ウガツ:h] 
H: さく:k] 3211 [のみ:h] 453 [うがつ:h] 
K: 钁: 886 [カク:k] 1210 [キャク:k] 
H: 钁: 886 [かく:k] 1210 [きゃく:k] 
K: 鸚: 437 [イン:k] 644 [オウ:k] 4403 [ヨウ:k] 
H: 鸚: 437 [いん:k] 644 [おう:k] 4403 [よう:k] 
K: 鸛: 1101 [カン:k] 1500 [コウノトリ:h] 
H: 鸛: 1101 [かん:k] 1500 [こうのとり:h] 
K: 鬱: 514 [ウツ:k] 512 [ウッスル:h] 3613 [フサグ:h] 1853 [シゲル:h] 
H: 鬱: 514 [うつ:k] 512 [うっする:h] 3613 [ふさぐ:h] 1853 [しげる:h] 
K: 爨: 1791 [サン:k] 922 [カシグ:h] 1035 [カマド:h] 
H: 爨: 1791 [さん:k] 922 [かしぐ:h] 1035 [かまど:h] 
K: ゠ァアィイゥウェエォオカガキギク
H: ゠ぁあぃいぅうぇえぉおかがきぎく
K: ヰヱヲンヴヵヶヷヸヹヺ・ーヽヾヿ
H: ゐゑをんゔゕゖヷヸヹヺ・ーゝゞヿ


Copyright © Ben Bullock 2009-2024. All rights reserved. For comments, questions, and corrections, please email Ben Bullock (benkasminbullock@gmail.com). / Privacy / Disclaimer