Algorithm to convert katakana to hiragana in UTF-8
This is an example C program which gives an algorithm for
converting katakana to hiragana in the UTF-8 encoding. The input
is a nul-terminated UTF-8 string which may contain katakana.
The output is a converted copy of the input in newly allocated
memory, so it needs to be freed with free
after use.
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

/* Convert the katakana in the nul-terminated UTF-8 string "katakana"
   into hiragana.

   On success, "* hiragana_ptr" is set to a newly allocated,
   nul-terminated copy of the input in which each convertible katakana
   character has been replaced by the corresponding hiragana, and the
   return value is 0. The caller owns the copy and must release it
   with "free".

   On failure (either argument is a null pointer, or memory could not
   be allocated), the return value is -1 and, if "hiragana_ptr" is not
   null, "* hiragana_ptr" is set to a null pointer.

   Bytes which are not part of a convertible three-byte katakana
   sequence, including malformed UTF-8, are copied through
   unchanged. */

int
kata2hira (const unsigned char * katakana, unsigned char ** hiragana_ptr)
{
    unsigned char * h;
    unsigned char * hiragana;
    size_t n_bytes;

    if (! hiragana_ptr) {
        return -1;
    }
    * hiragana_ptr = NULL;
    if (! katakana) {
        return -1;
    }

    /* Copy the input with malloc/memcpy rather than strdup, since
       strdup is not part of ISO C before C23 and is not declared by
       <string.h> in strict standard modes. */
    n_bytes = strlen ((const char *) katakana) + 1;
    hiragana = malloc (n_bytes);
    if (! hiragana) {
        return -1;
    }
    memcpy (hiragana, katakana, n_bytes);

    h = hiragana;
    while (* h) {
        /* Check that this is within the katakana block from E3 82 A0
           to E3 83 BF. The third byte must be a UTF-8 continuation
           byte (80 to BF); this also rejects a string which ends in
           the middle of a sequence, because the terminating nul
           fails the test. */
        if (h[0] == 0xe3 &&
            (h[1] == 0x82 || h[1] == 0x83) &&
            h[2] >= 0x80 && h[2] <= 0xbf) {
            /* Check that this is within the range of katakana which
               can be converted into hiragana: E3 82 A1 to E3 83 B6,
               plus the iteration marks E3 83 BD and E3 83 BE. */
            if ((h[1] == 0x82 && h[2] >= 0xa1) ||
                (h[1] == 0x83 && h[2] <= 0xb6) ||
                (h[1] == 0x83 && (h[2] == 0xbd || h[2] == 0xbe))) {
                /* Byte conversion from katakana to hiragana. The
                   hiragana are 0x60 code points below the katakana,
                   and each value of the second byte covers 0x40 code
                   points, so either step back one second byte and
                   subtract 0x20 from the third byte, or step back two
                   second bytes and add 0x20 to the third byte,
                   whichever keeps the third byte within 80 to BF. */
                if (h[2] >= 0xa0) {
                    h[1] = (unsigned char) (h[1] - 1);
                    h[2] = (unsigned char) (h[2] - 0x20);
                }
                else {
                    h[1] = (unsigned char) (h[1] - 2);
                    h[2] = (unsigned char) (h[2] + 0x20);
                }
            }
            h += 3;
        }
        else {
            h++;
        }
    }
    * hiragana_ptr = hiragana;
    return 0;
}

/* Define KATA2HIRA_NO_MAIN when compiling to leave out the
   demonstration program below, for example to use "kata2hira" from
   another program. */

#ifndef KATA2HIRA_NO_MAIN

/* Convert each of the test strings and print the original ("K:")
   followed by the converted string ("H:"). */

int
main (void)
{
    const char * tests[] = {
        "サク:k] 3211 [ノミ:h] 453 [ウガツ:h] ",
        "钁: 886 [カク:k] 1210 [キャク:k] ",
        "鸚: 437 [イン:k] 644 [オウ:k] 4403 [ヨウ:k] ",
        "鸛: 1101 [カン:k] 1500 [コウノトリ:h] ",
        "鬱: 514 [ウツ:k] 512 [ウッスル:h] 3613 [フサグ:h] 1853 [シゲル:h] ",
        "爨: 1791 [サン:k] 922 [カシグ:h] 1035 [カマド:h] ",
        /* First row of unicode katakana. */
        "゠ァアィイゥウェエォオカガキギク",
        /* Last row of unicode katakana. */
        "ヰヱヲンヴヵヶヷヸヹヺ・ーヽヾヿ",
    };
    size_t n_tests = sizeof (tests) / sizeof (tests[0]);
    size_t i;
    int status = EXIT_SUCCESS;

    for (i = 0; i < n_tests; i++) {
        unsigned char * hiragana;
        if (kata2hira ((const unsigned char *) tests[i], & hiragana) != 0) {
            fprintf (stderr, "Conversion of test string %lu failed.\n",
                     (unsigned long) i);
            status = EXIT_FAILURE;
            continue;
        }
        printf ("K: %s\nH: %s\n", tests[i], (const char *) hiragana);
        free (hiragana);
    }
    return status;
}

#endif /* ndef KATA2HIRA_NO_MAIN */
The output of the example looks like this:
K: サク:k] 3211 [ノミ:h] 453 [ウガツ:h] 
H: さく:k] 3211 [のみ:h] 453 [うがつ:h] 
K: 钁: 886 [カク:k] 1210 [キャク:k] 
H: 钁: 886 [かく:k] 1210 [きゃく:k] 
K: 鸚: 437 [イン:k] 644 [オウ:k] 4403 [ヨウ:k] 
H: 鸚: 437 [いん:k] 644 [おう:k] 4403 [よう:k] 
K: 鸛: 1101 [カン:k] 1500 [コウノトリ:h] 
H: 鸛: 1101 [かん:k] 1500 [こうのとり:h] 
K: 鬱: 514 [ウツ:k] 512 [ウッスル:h] 3613 [フサグ:h] 1853 [シゲル:h] 
H: 鬱: 514 [うつ:k] 512 [うっする:h] 3613 [ふさぐ:h] 1853 [しげる:h] 
K: 爨: 1791 [サン:k] 922 [カシグ:h] 1035 [カマド:h] 
H: 爨: 1791 [さん:k] 922 [かしぐ:h] 1035 [かまど:h] 
K: ゠ァアィイゥウェエォオカガキギク
H: ゠ぁあぃいぅうぇえぉおかがきぎく
K: ヰヱヲンヴヵヶヷヸヹヺ・ーヽヾヿ
H: ゐゑをんゔゕゖヷヸヹヺ・ーゝゞヿ
Copyright © Ben Bullock 2009-2024. All
rights reserved.
For comments, questions, and corrections, please email
Ben Bullock
(benkasminbullock@gmail.com).
/
Privacy /
Disclaimer