mirror of https://github.com/python/cpython.git
1132 lines
33 KiB
C
1132 lines
33 KiB
C
/*
|
|
* _codecs_iso2022.c: Codecs collection for ISO-2022 encodings.
|
|
*
|
|
* Written by Hye-Shik Chang <perky@FreeBSD.org>
|
|
*/
|
|
|
|
#define USING_IMPORTED_MAPS
|
|
#define USING_BINARY_PAIR_SEARCH
|
|
#define EXTERN_JISX0213_PAIR
|
|
#define EMULATE_JISX0213_2000_ENCODE_INVALID MAP_UNMAPPABLE
|
|
#define EMULATE_JISX0213_2000_DECODE_INVALID MAP_UNMAPPABLE
|
|
|
|
#include "cjkcodecs.h"
|
|
#include "alg_jisx0201.h"
|
|
#include "emu_jisx0213_2000.h"
|
|
#include "mappings_jisx0213_pair.h"
|
|
|
|
/* STATE

   state->c[0-3]

        00000000
        ||^^^^^|
        |+-----+----  G0-3 Character Set
        +-----------  Is G0-3 double byte?

   state->c[4]

        00000000
              ||
              |+----  Locked-Shift?
              +-----  ESC Throughout
*/
|
|
|
|
/* Control bytes that the codec treats specially. */
#define ESC                     0x1B
#define SO                      0x0E
#define SI                      0x0F
#define LF                      0x0A

/* Upper bound on the number of bytes scanned for one escape sequence. */
#define MAX_ESCSEQLEN           16

/* Bit 7 of a charset mark flags a double-byte set; ESCMARK() strips it
 * again to recover the final byte written in the escape sequence. */
#define CHARSET_DBCS            0x80
#define ESCMARK(mark)           ((mark) & 0x7f)

/* Single-byte character sets (mark == final byte of the designation). */
#define CHARSET_ISO8859_1       'A'
#define CHARSET_ASCII           'B'
#define CHARSET_ISO8859_7       'F'
#define CHARSET_JISX0201_K      'I'
#define CHARSET_JISX0201_R      'J'

/* Double-byte character sets (final byte OR-ed with CHARSET_DBCS). */
#define CHARSET_GB2312          ('A'|CHARSET_DBCS)
#define CHARSET_JISX0208        ('B'|CHARSET_DBCS)
#define CHARSET_KSX1001         ('C'|CHARSET_DBCS)
#define CHARSET_JISX0212        ('D'|CHARSET_DBCS)
#define CHARSET_GB2312_8565     ('E'|CHARSET_DBCS)
#define CHARSET_CNS11643_1      ('G'|CHARSET_DBCS)
#define CHARSET_CNS11643_2      ('H'|CHARSET_DBCS)
#define CHARSET_JISX0213_2000_1 ('O'|CHARSET_DBCS)
#define CHARSET_JISX0213_2      ('P'|CHARSET_DBCS)
#define CHARSET_JISX0213_2004_1 ('Q'|CHARSET_DBCS)
#define CHARSET_JISX0208_O      ('@'|CHARSET_DBCS)

/* An escape sequence ends at the first byte in 'A'..'Z' or '@'. */
#define IS_ESCEND(c)    (((c) >= 'A' && (c) <= 'Z') || (c) == '@')
#define IS_ISO2022ESC(c2) \
        ((c2) == '(' || (c2) == ')' || (c2) == '$' || \
         (c2) == '.' || (c2) == '&')
        /* this is not a complete list of ISO-2022 escape sequence headers.
         * but, it's enough to implement CJK instances of iso-2022. */

/* Sentinel results returned by the per-charset encoder/decoder hooks. */
#define MAP_UNMAPPABLE          0xFFFF
#define MAP_MULTIPLE_AVAIL      0xFFFE /* for JIS X 0213 */

/* Flag bits kept in state->c[4]. */
#define F_SHIFTED               0x01
#define F_ESCTHROUGHOUT         0x02

/* Accessors for the G0..G3 slots in state->c[0..3].  The setters carry
 * their own trailing ';' and are invoked without one; they expect a
 * variable named `state` to be in scope. */
#define STATE_SETG(dn, v)       ((state)->c[dn]) = (v);
#define STATE_GETG(dn)          ((state)->c[dn])

#define STATE_G0                STATE_GETG(0)
#define STATE_G1                STATE_GETG(1)
#define STATE_G2                STATE_GETG(2)
#define STATE_G3                STATE_GETG(3)
#define STATE_SETG0(v)          STATE_SETG(0, v)
#define STATE_SETG1(v)          STATE_SETG(1, v)
#define STATE_SETG2(v)          STATE_SETG(2, v)
#define STATE_SETG3(v)          STATE_SETG(3, v)

/* Flag manipulation on state->c[4]; statement macros, no ';' needed. */
#define STATE_SETFLAG(f)        ((state)->c[4]) |= (f);
#define STATE_GETFLAG(f)        ((state)->c[4] & (f))
#define STATE_CLEARFLAG(f)      ((state)->c[4]) &= ~(f);
#define STATE_CLEARFLAGS()      ((state)->c[4]) = 0;

/* View of the opaque `config` pointer as this module's config record. */
#define ISO2022_CONFIG          ((const struct iso2022_config *)config)
#define CONFIG_ISSET(flag)      (ISO2022_CONFIG->flags & (flag))
#define CONFIG_DESIGNATIONS     (ISO2022_CONFIG->designations)

/* iso2022_config.flags */
#define NO_SHIFT                0x01
#define USE_G2                  0x02
#define USE_JISX0208_EXT        0x04
|
|
|
|
/*-*- internal data structures -*-*/
|
|
|
|
typedef int (*iso2022_init_func)(void);
|
|
typedef ucs4_t (*iso2022_decode_func)(const unsigned char *data);
|
|
typedef DBCHAR (*iso2022_encode_func)(const ucs4_t *data, Py_ssize_t *length);
|
|
|
|
struct iso2022_designation {
|
|
unsigned char mark;
|
|
unsigned char plane;
|
|
unsigned char width;
|
|
iso2022_init_func initializer;
|
|
iso2022_decode_func decoder;
|
|
iso2022_encode_func encoder;
|
|
};
|
|
|
|
struct iso2022_config {
|
|
int flags;
|
|
const struct iso2022_designation *designations; /* non-ascii desigs */
|
|
};
|
|
|
|
/*-*- iso-2022 codec implementation -*-*/
|
|
|
|
CODEC_INIT(iso2022)
|
|
{
|
|
const struct iso2022_designation *desig = CONFIG_DESIGNATIONS;
|
|
for (desig = CONFIG_DESIGNATIONS; desig->mark; desig++)
|
|
if (desig->initializer != NULL && desig->initializer() != 0)
|
|
return -1;
|
|
return 0;
|
|
}
|
|
|
|
ENCODER_INIT(iso2022)
|
|
{
|
|
STATE_CLEARFLAGS()
|
|
STATE_SETG0(CHARSET_ASCII)
|
|
STATE_SETG1(CHARSET_ASCII)
|
|
return 0;
|
|
}
|
|
|
|
ENCODER_RESET(iso2022)
|
|
{
|
|
if (STATE_GETFLAG(F_SHIFTED)) {
|
|
WRITE1(SI)
|
|
NEXT_OUT(1)
|
|
STATE_CLEARFLAG(F_SHIFTED)
|
|
}
|
|
if (STATE_G0 != CHARSET_ASCII) {
|
|
WRITE3(ESC, '(', 'B')
|
|
NEXT_OUT(3)
|
|
STATE_SETG0(CHARSET_ASCII)
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
ENCODER(iso2022)
|
|
{
|
|
while (inleft > 0) {
|
|
const struct iso2022_designation *dsg;
|
|
DBCHAR encoded;
|
|
ucs4_t c = **inbuf;
|
|
Py_ssize_t insize;
|
|
|
|
if (c < 0x80) {
|
|
if (STATE_G0 != CHARSET_ASCII) {
|
|
WRITE3(ESC, '(', 'B')
|
|
STATE_SETG0(CHARSET_ASCII)
|
|
NEXT_OUT(3)
|
|
}
|
|
if (STATE_GETFLAG(F_SHIFTED)) {
|
|
WRITE1(SI)
|
|
STATE_CLEARFLAG(F_SHIFTED)
|
|
NEXT_OUT(1)
|
|
}
|
|
WRITE1((unsigned char)c)
|
|
NEXT(1, 1)
|
|
continue;
|
|
}
|
|
|
|
DECODE_SURROGATE(c)
|
|
insize = GET_INSIZE(c);
|
|
|
|
encoded = MAP_UNMAPPABLE;
|
|
for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
|
|
Py_ssize_t length = 1;
|
|
encoded = dsg->encoder(&c, &length);
|
|
if (encoded == MAP_MULTIPLE_AVAIL) {
|
|
/* this implementation won't work for pair
|
|
* of non-bmp characters. */
|
|
if (inleft < 2) {
|
|
if (!(flags & MBENC_FLUSH))
|
|
return MBERR_TOOFEW;
|
|
length = -1;
|
|
}
|
|
else
|
|
length = 2;
|
|
#if Py_UNICODE_SIZE == 2
|
|
if (length == 2) {
|
|
ucs4_t u4in[2];
|
|
u4in[0] = (ucs4_t)IN1;
|
|
u4in[1] = (ucs4_t)IN2;
|
|
encoded = dsg->encoder(u4in, &length);
|
|
} else
|
|
encoded = dsg->encoder(&c, &length);
|
|
#else
|
|
encoded = dsg->encoder(&c, &length);
|
|
#endif
|
|
if (encoded != MAP_UNMAPPABLE) {
|
|
insize = length;
|
|
break;
|
|
}
|
|
}
|
|
else if (encoded != MAP_UNMAPPABLE)
|
|
break;
|
|
}
|
|
|
|
if (!dsg->mark)
|
|
return 1;
|
|
assert(dsg->width == 1 || dsg->width == 2);
|
|
|
|
switch (dsg->plane) {
|
|
case 0: /* G0 */
|
|
if (STATE_GETFLAG(F_SHIFTED)) {
|
|
WRITE1(SI)
|
|
STATE_CLEARFLAG(F_SHIFTED)
|
|
NEXT_OUT(1)
|
|
}
|
|
if (STATE_G0 != dsg->mark) {
|
|
if (dsg->width == 1) {
|
|
WRITE3(ESC, '(', ESCMARK(dsg->mark))
|
|
STATE_SETG0(dsg->mark)
|
|
NEXT_OUT(3)
|
|
}
|
|
else if (dsg->mark == CHARSET_JISX0208) {
|
|
WRITE3(ESC, '$', ESCMARK(dsg->mark))
|
|
STATE_SETG0(dsg->mark)
|
|
NEXT_OUT(3)
|
|
}
|
|
else {
|
|
WRITE4(ESC, '$', '(',
|
|
ESCMARK(dsg->mark))
|
|
STATE_SETG0(dsg->mark)
|
|
NEXT_OUT(4)
|
|
}
|
|
}
|
|
break;
|
|
case 1: /* G1 */
|
|
if (STATE_G1 != dsg->mark) {
|
|
if (dsg->width == 1) {
|
|
WRITE3(ESC, ')', ESCMARK(dsg->mark))
|
|
STATE_SETG1(dsg->mark)
|
|
NEXT_OUT(3)
|
|
}
|
|
else {
|
|
WRITE4(ESC, '$', ')',
|
|
ESCMARK(dsg->mark))
|
|
STATE_SETG1(dsg->mark)
|
|
NEXT_OUT(4)
|
|
}
|
|
}
|
|
if (!STATE_GETFLAG(F_SHIFTED)) {
|
|
WRITE1(SO)
|
|
STATE_SETFLAG(F_SHIFTED)
|
|
NEXT_OUT(1)
|
|
}
|
|
break;
|
|
default: /* G2 and G3 is not supported: no encoding in
|
|
* CJKCodecs are using them yet */
|
|
return MBERR_INTERNAL;
|
|
}
|
|
|
|
if (dsg->width == 1) {
|
|
WRITE1((unsigned char)encoded)
|
|
NEXT_OUT(1)
|
|
}
|
|
else {
|
|
WRITE2(encoded >> 8, encoded & 0xff)
|
|
NEXT_OUT(2)
|
|
}
|
|
NEXT_IN(insize)
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
DECODER_INIT(iso2022)
|
|
{
|
|
STATE_CLEARFLAGS()
|
|
STATE_SETG0(CHARSET_ASCII)
|
|
STATE_SETG1(CHARSET_ASCII)
|
|
STATE_SETG2(CHARSET_ASCII)
|
|
return 0;
|
|
}
|
|
|
|
DECODER_RESET(iso2022)
|
|
{
|
|
STATE_SETG0(CHARSET_ASCII)
|
|
STATE_CLEARFLAG(F_SHIFTED)
|
|
return 0;
|
|
}
|
|
|
|
static Py_ssize_t
|
|
iso2022processesc(const void *config, MultibyteCodec_State *state,
|
|
const unsigned char **inbuf, Py_ssize_t *inleft)
|
|
{
|
|
unsigned char charset, designation;
|
|
Py_ssize_t i, esclen;
|
|
|
|
for (i = 1;i < MAX_ESCSEQLEN;i++) {
|
|
if (i >= *inleft)
|
|
return MBERR_TOOFEW;
|
|
if (IS_ESCEND((*inbuf)[i])) {
|
|
esclen = i + 1;
|
|
break;
|
|
}
|
|
else if (CONFIG_ISSET(USE_JISX0208_EXT) && i+1 < *inleft &&
|
|
(*inbuf)[i] == '&' && (*inbuf)[i+1] == '@')
|
|
i += 2;
|
|
}
|
|
|
|
if (i >= MAX_ESCSEQLEN)
|
|
return 1; /* unterminated escape sequence */
|
|
|
|
switch (esclen) {
|
|
case 3:
|
|
if (IN2 == '$') {
|
|
charset = IN3 | CHARSET_DBCS;
|
|
designation = 0;
|
|
}
|
|
else {
|
|
charset = IN3;
|
|
if (IN2 == '(') designation = 0;
|
|
else if (IN2 == ')') designation = 1;
|
|
else if (CONFIG_ISSET(USE_G2) && IN2 == '.')
|
|
designation = 2;
|
|
else return 3;
|
|
}
|
|
break;
|
|
case 4:
|
|
if (IN2 != '$')
|
|
return 4;
|
|
|
|
charset = IN4 | CHARSET_DBCS;
|
|
if (IN3 == '(') designation = 0;
|
|
else if (IN3 == ')') designation = 1;
|
|
else return 4;
|
|
break;
|
|
case 6: /* designation with prefix */
|
|
if (CONFIG_ISSET(USE_JISX0208_EXT) &&
|
|
(*inbuf)[3] == ESC && (*inbuf)[4] == '$' &&
|
|
(*inbuf)[5] == 'B') {
|
|
charset = 'B' | CHARSET_DBCS;
|
|
designation = 0;
|
|
}
|
|
else
|
|
return 6;
|
|
break;
|
|
default:
|
|
return esclen;
|
|
}
|
|
|
|
/* raise error when the charset is not designated for this encoding */
|
|
if (charset != CHARSET_ASCII) {
|
|
const struct iso2022_designation *dsg;
|
|
|
|
for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++)
|
|
if (dsg->mark == charset)
|
|
break;
|
|
if (!dsg->mark)
|
|
return esclen;
|
|
}
|
|
|
|
STATE_SETG(designation, charset)
|
|
*inleft -= esclen;
|
|
(*inbuf) += esclen;
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Map the ISO-8859-7 byte `c` to a code point stored in `assi`.
 *
 * Expands to an if / else-if chain with no final `else`, so the caller
 * appends its own `else` branch to handle unassigned bytes.  `c` is
 * evaluated several times.
 *
 * The two bit masks mark which bytes in 0xa0..0xbf map to themselves and
 * which bytes in 0xb4..0xd3 map to 0x02d0 + c.  Bit 31 can be tested
 * (c == 0xbf and c == 0xd3), so masks and shifts are unsigned long:
 * `1L << 31` is undefined where long is 32 bits wide.
 */
#define ISO8859_7_DECODE(c, assi)                                       \
    if ((c) < 0xa0) (assi) = (c);                                       \
    else if ((c) < 0xc0 && (0x288f3bc9UL & (1UL << ((c)-0xa0))))        \
        (assi) = (c);                                                   \
    else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 ||              \
             (0xbffffd77UL & (1UL << ((c)-0xb4)))))                     \
        (assi) = 0x02d0 + (c);                                          \
    else if ((c) == 0xa1) (assi) = 0x2018;                              \
    else if ((c) == 0xa2) (assi) = 0x2019;                              \
    else if ((c) == 0xaf) (assi) = 0x2015;
|
|
|
|
static Py_ssize_t
|
|
iso2022processg2(const void *config, MultibyteCodec_State *state,
|
|
const unsigned char **inbuf, Py_ssize_t *inleft,
|
|
Py_UNICODE **outbuf, Py_ssize_t *outleft)
|
|
{
|
|
/* not written to use encoder, decoder functions because only few
|
|
* encodings use G2 designations in CJKCodecs */
|
|
if (STATE_G2 == CHARSET_ISO8859_1) {
|
|
if (IN3 < 0x80)
|
|
OUT1(IN3 + 0x80)
|
|
else
|
|
return 3;
|
|
}
|
|
else if (STATE_G2 == CHARSET_ISO8859_7) {
|
|
ISO8859_7_DECODE(IN3 ^ 0x80, **outbuf)
|
|
else return 3;
|
|
}
|
|
else if (STATE_G2 == CHARSET_ASCII) {
|
|
if (IN3 & 0x80) return 3;
|
|
else **outbuf = IN3;
|
|
}
|
|
else
|
|
return MBERR_INTERNAL;
|
|
|
|
(*inbuf) += 3;
|
|
*inleft -= 3;
|
|
(*outbuf) += 1;
|
|
*outleft -= 1;
|
|
return 0;
|
|
}
|
|
|
|
DECODER(iso2022)
|
|
{
|
|
const struct iso2022_designation *dsgcache = NULL;
|
|
|
|
while (inleft > 0) {
|
|
unsigned char c = IN1;
|
|
Py_ssize_t err;
|
|
|
|
if (STATE_GETFLAG(F_ESCTHROUGHOUT)) {
|
|
/* ESC throughout mode:
|
|
* for non-iso2022 escape sequences */
|
|
WRITE1(c) /* assume as ISO-8859-1 */
|
|
NEXT(1, 1)
|
|
if (IS_ESCEND(c)) {
|
|
STATE_CLEARFLAG(F_ESCTHROUGHOUT)
|
|
}
|
|
continue;
|
|
}
|
|
|
|
switch (c) {
|
|
case ESC:
|
|
REQUIRE_INBUF(2)
|
|
if (IS_ISO2022ESC(IN2)) {
|
|
err = iso2022processesc(config, state,
|
|
inbuf, &inleft);
|
|
if (err != 0)
|
|
return err;
|
|
}
|
|
else if (CONFIG_ISSET(USE_G2) && IN2 == 'N') {/* SS2 */
|
|
REQUIRE_INBUF(3)
|
|
err = iso2022processg2(config, state,
|
|
inbuf, &inleft, outbuf, &outleft);
|
|
if (err != 0)
|
|
return err;
|
|
}
|
|
else {
|
|
WRITE1(ESC)
|
|
STATE_SETFLAG(F_ESCTHROUGHOUT)
|
|
NEXT(1, 1)
|
|
}
|
|
break;
|
|
case SI:
|
|
if (CONFIG_ISSET(NO_SHIFT))
|
|
goto bypass;
|
|
STATE_CLEARFLAG(F_SHIFTED)
|
|
NEXT_IN(1)
|
|
break;
|
|
case SO:
|
|
if (CONFIG_ISSET(NO_SHIFT))
|
|
goto bypass;
|
|
STATE_SETFLAG(F_SHIFTED)
|
|
NEXT_IN(1)
|
|
break;
|
|
case LF:
|
|
STATE_CLEARFLAG(F_SHIFTED)
|
|
WRITE1(LF)
|
|
NEXT(1, 1)
|
|
break;
|
|
default:
|
|
if (c < 0x20) /* C0 */
|
|
goto bypass;
|
|
else if (c >= 0x80)
|
|
return 1;
|
|
else {
|
|
const struct iso2022_designation *dsg;
|
|
unsigned char charset;
|
|
ucs4_t decoded;
|
|
|
|
if (STATE_GETFLAG(F_SHIFTED))
|
|
charset = STATE_G1;
|
|
else
|
|
charset = STATE_G0;
|
|
|
|
if (charset == CHARSET_ASCII) {
|
|
bypass: WRITE1(c)
|
|
NEXT(1, 1)
|
|
break;
|
|
}
|
|
|
|
if (dsgcache != NULL &&
|
|
dsgcache->mark == charset)
|
|
dsg = dsgcache;
|
|
else {
|
|
for (dsg = CONFIG_DESIGNATIONS;
|
|
dsg->mark != charset
|
|
#ifdef Py_DEBUG
|
|
&& dsg->mark != '\0'
|
|
#endif
|
|
;dsg++)
|
|
/* noop */;
|
|
assert(dsg->mark != '\0');
|
|
dsgcache = dsg;
|
|
}
|
|
|
|
REQUIRE_INBUF(dsg->width)
|
|
decoded = dsg->decoder(*inbuf);
|
|
if (decoded == MAP_UNMAPPABLE)
|
|
return dsg->width;
|
|
|
|
if (decoded < 0x10000) {
|
|
WRITE1(decoded)
|
|
NEXT_OUT(1)
|
|
}
|
|
else if (decoded < 0x30000) {
|
|
WRITEUCS4(decoded)
|
|
}
|
|
else { /* JIS X 0213 pairs */
|
|
WRITE2(decoded >> 16, decoded & 0xffff)
|
|
NEXT_OUT(2)
|
|
}
|
|
NEXT_IN(dsg->width)
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/*-*- mapping table holders -*-*/
|
|
|
|
#define ENCMAP(enc) static const encode_map *enc##_encmap = NULL;
|
|
#define DECMAP(enc) static const decode_map *enc##_decmap = NULL;
|
|
|
|
/* kr */
|
|
ENCMAP(cp949)
|
|
DECMAP(ksx1001)
|
|
|
|
/* jp */
|
|
ENCMAP(jisxcommon)
|
|
DECMAP(jisx0208)
|
|
DECMAP(jisx0212)
|
|
ENCMAP(jisx0213_bmp)
|
|
DECMAP(jisx0213_1_bmp)
|
|
DECMAP(jisx0213_2_bmp)
|
|
ENCMAP(jisx0213_emp)
|
|
DECMAP(jisx0213_1_emp)
|
|
DECMAP(jisx0213_2_emp)
|
|
|
|
/* cn */
|
|
ENCMAP(gbcommon)
|
|
DECMAP(gb2312)
|
|
|
|
/* tw */
|
|
|
|
/*-*- mapping access functions -*-*/
|
|
|
|
static int
|
|
ksx1001_init(void)
|
|
{
|
|
static int initialized = 0;
|
|
|
|
if (!initialized && (
|
|
IMPORT_MAP(kr, cp949, &cp949_encmap, NULL) ||
|
|
IMPORT_MAP(kr, ksx1001, NULL, &ksx1001_decmap)))
|
|
return -1;
|
|
initialized = 1;
|
|
return 0;
|
|
}
|
|
|
|
static ucs4_t
|
|
ksx1001_decoder(const unsigned char *data)
|
|
{
|
|
ucs4_t u;
|
|
TRYMAP_DEC(ksx1001, u, data[0], data[1])
|
|
return u;
|
|
else
|
|
return MAP_UNMAPPABLE;
|
|
}
|
|
|
|
static DBCHAR
|
|
ksx1001_encoder(const ucs4_t *data, Py_ssize_t *length)
|
|
{
|
|
DBCHAR coded;
|
|
assert(*length == 1);
|
|
if (*data < 0x10000) {
|
|
TRYMAP_ENC(cp949, coded, *data)
|
|
if (!(coded & 0x8000))
|
|
return coded;
|
|
}
|
|
return MAP_UNMAPPABLE;
|
|
}
|
|
|
|
static int
|
|
jisx0208_init(void)
|
|
{
|
|
static int initialized = 0;
|
|
|
|
if (!initialized && (
|
|
IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
|
|
IMPORT_MAP(jp, jisx0208, NULL, &jisx0208_decmap)))
|
|
return -1;
|
|
initialized = 1;
|
|
return 0;
|
|
}
|
|
|
|
static ucs4_t
|
|
jisx0208_decoder(const unsigned char *data)
|
|
{
|
|
ucs4_t u;
|
|
if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
|
|
return 0xff3c;
|
|
else TRYMAP_DEC(jisx0208, u, data[0], data[1])
|
|
return u;
|
|
else
|
|
return MAP_UNMAPPABLE;
|
|
}
|
|
|
|
static DBCHAR
|
|
jisx0208_encoder(const ucs4_t *data, Py_ssize_t *length)
|
|
{
|
|
DBCHAR coded;
|
|
assert(*length == 1);
|
|
if (*data < 0x10000) {
|
|
if (*data == 0xff3c) /* F/W REVERSE SOLIDUS */
|
|
return 0x2140;
|
|
else TRYMAP_ENC(jisxcommon, coded, *data) {
|
|
if (!(coded & 0x8000))
|
|
return coded;
|
|
}
|
|
}
|
|
return MAP_UNMAPPABLE;
|
|
}
|
|
|
|
static int
|
|
jisx0212_init(void)
|
|
{
|
|
static int initialized = 0;
|
|
|
|
if (!initialized && (
|
|
IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
|
|
IMPORT_MAP(jp, jisx0212, NULL, &jisx0212_decmap)))
|
|
return -1;
|
|
initialized = 1;
|
|
return 0;
|
|
}
|
|
|
|
static ucs4_t
|
|
jisx0212_decoder(const unsigned char *data)
|
|
{
|
|
ucs4_t u;
|
|
TRYMAP_DEC(jisx0212, u, data[0], data[1])
|
|
return u;
|
|
else
|
|
return MAP_UNMAPPABLE;
|
|
}
|
|
|
|
static DBCHAR
|
|
jisx0212_encoder(const ucs4_t *data, Py_ssize_t *length)
|
|
{
|
|
DBCHAR coded;
|
|
assert(*length == 1);
|
|
if (*data < 0x10000) {
|
|
TRYMAP_ENC(jisxcommon, coded, *data) {
|
|
if (coded & 0x8000)
|
|
return coded & 0x7fff;
|
|
}
|
|
}
|
|
return MAP_UNMAPPABLE;
|
|
}
|
|
|
|
static int
|
|
jisx0213_init(void)
|
|
{
|
|
static int initialized = 0;
|
|
|
|
if (!initialized && (
|
|
jisx0208_init() ||
|
|
IMPORT_MAP(jp, jisx0213_bmp,
|
|
&jisx0213_bmp_encmap, NULL) ||
|
|
IMPORT_MAP(jp, jisx0213_1_bmp,
|
|
NULL, &jisx0213_1_bmp_decmap) ||
|
|
IMPORT_MAP(jp, jisx0213_2_bmp,
|
|
NULL, &jisx0213_2_bmp_decmap) ||
|
|
IMPORT_MAP(jp, jisx0213_emp,
|
|
&jisx0213_emp_encmap, NULL) ||
|
|
IMPORT_MAP(jp, jisx0213_1_emp,
|
|
NULL, &jisx0213_1_emp_decmap) ||
|
|
IMPORT_MAP(jp, jisx0213_2_emp,
|
|
NULL, &jisx0213_2_emp_decmap) ||
|
|
IMPORT_MAP(jp, jisx0213_pair, &jisx0213_pair_encmap,
|
|
&jisx0213_pair_decmap)))
|
|
return -1;
|
|
initialized = 1;
|
|
return 0;
|
|
}
|
|
|
|
#define config ((void *)2000)
|
|
static ucs4_t
|
|
jisx0213_2000_1_decoder(const unsigned char *data)
|
|
{
|
|
ucs4_t u;
|
|
EMULATE_JISX0213_2000_DECODE_PLANE1(u, data[0], data[1])
|
|
else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
|
|
return 0xff3c;
|
|
else TRYMAP_DEC(jisx0208, u, data[0], data[1]);
|
|
else TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]);
|
|
else TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1])
|
|
u |= 0x20000;
|
|
else TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]);
|
|
else
|
|
return MAP_UNMAPPABLE;
|
|
return u;
|
|
}
|
|
|
|
static ucs4_t
|
|
jisx0213_2000_2_decoder(const unsigned char *data)
|
|
{
|
|
ucs4_t u;
|
|
EMULATE_JISX0213_2000_DECODE_PLANE2(u, data[0], data[1])
|
|
TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]);
|
|
else TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1])
|
|
u |= 0x20000;
|
|
else
|
|
return MAP_UNMAPPABLE;
|
|
return u;
|
|
}
|
|
#undef config
|
|
|
|
static ucs4_t
|
|
jisx0213_2004_1_decoder(const unsigned char *data)
|
|
{
|
|
ucs4_t u;
|
|
if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
|
|
return 0xff3c;
|
|
else TRYMAP_DEC(jisx0208, u, data[0], data[1]);
|
|
else TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]);
|
|
else TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1])
|
|
u |= 0x20000;
|
|
else TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]);
|
|
else
|
|
return MAP_UNMAPPABLE;
|
|
return u;
|
|
}
|
|
|
|
static ucs4_t
|
|
jisx0213_2004_2_decoder(const unsigned char *data)
|
|
{
|
|
ucs4_t u;
|
|
TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]);
|
|
else TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1])
|
|
u |= 0x20000;
|
|
else
|
|
return MAP_UNMAPPABLE;
|
|
return u;
|
|
}
|
|
|
|
static DBCHAR
|
|
jisx0213_encoder(const ucs4_t *data, Py_ssize_t *length, void *config)
|
|
{
|
|
DBCHAR coded;
|
|
|
|
switch (*length) {
|
|
case 1: /* first character */
|
|
if (*data >= 0x10000) {
|
|
if ((*data) >> 16 == 0x20000 >> 16) {
|
|
EMULATE_JISX0213_2000_ENCODE_EMP(coded, *data)
|
|
else TRYMAP_ENC(jisx0213_emp, coded,
|
|
(*data) & 0xffff)
|
|
return coded;
|
|
}
|
|
return MAP_UNMAPPABLE;
|
|
}
|
|
|
|
EMULATE_JISX0213_2000_ENCODE_BMP(coded, *data)
|
|
else TRYMAP_ENC(jisx0213_bmp, coded, *data) {
|
|
if (coded == MULTIC)
|
|
return MAP_MULTIPLE_AVAIL;
|
|
}
|
|
else TRYMAP_ENC(jisxcommon, coded, *data) {
|
|
if (coded & 0x8000)
|
|
return MAP_UNMAPPABLE;
|
|
}
|
|
else
|
|
return MAP_UNMAPPABLE;
|
|
return coded;
|
|
case 2: /* second character of unicode pair */
|
|
coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
|
|
jisx0213_pair_encmap, JISX0213_ENCPAIRS);
|
|
if (coded == DBCINV) {
|
|
*length = 1;
|
|
coded = find_pairencmap((ucs2_t)data[0], 0,
|
|
jisx0213_pair_encmap, JISX0213_ENCPAIRS);
|
|
if (coded == DBCINV)
|
|
return MAP_UNMAPPABLE;
|
|
}
|
|
else
|
|
return coded;
|
|
case -1: /* flush unterminated */
|
|
*length = 1;
|
|
coded = find_pairencmap((ucs2_t)data[0], 0,
|
|
jisx0213_pair_encmap, JISX0213_ENCPAIRS);
|
|
if (coded == DBCINV)
|
|
return MAP_UNMAPPABLE;
|
|
else
|
|
return coded;
|
|
default:
|
|
return MAP_UNMAPPABLE;
|
|
}
|
|
}
|
|
|
|
static DBCHAR
|
|
jisx0213_2000_1_encoder(const ucs4_t *data, Py_ssize_t *length)
|
|
{
|
|
DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
|
|
if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
|
|
return coded;
|
|
else if (coded & 0x8000)
|
|
return MAP_UNMAPPABLE;
|
|
else
|
|
return coded;
|
|
}
|
|
|
|
static DBCHAR
|
|
jisx0213_2000_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length)
|
|
{
|
|
DBCHAR coded;
|
|
Py_ssize_t ilength = *length;
|
|
|
|
coded = jisx0213_encoder(data, length, (void *)2000);
|
|
switch (ilength) {
|
|
case 1:
|
|
if (coded == MAP_MULTIPLE_AVAIL)
|
|
return MAP_MULTIPLE_AVAIL;
|
|
else
|
|
return MAP_UNMAPPABLE;
|
|
case 2:
|
|
if (*length != 2)
|
|
return MAP_UNMAPPABLE;
|
|
else
|
|
return coded;
|
|
default:
|
|
return MAP_UNMAPPABLE;
|
|
}
|
|
}
|
|
|
|
static DBCHAR
|
|
jisx0213_2000_2_encoder(const ucs4_t *data, Py_ssize_t *length)
|
|
{
|
|
DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
|
|
if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
|
|
return coded;
|
|
else if (coded & 0x8000)
|
|
return coded & 0x7fff;
|
|
else
|
|
return MAP_UNMAPPABLE;
|
|
}
|
|
|
|
static DBCHAR
|
|
jisx0213_2004_1_encoder(const ucs4_t *data, Py_ssize_t *length)
|
|
{
|
|
DBCHAR coded = jisx0213_encoder(data, length, NULL);
|
|
if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
|
|
return coded;
|
|
else if (coded & 0x8000)
|
|
return MAP_UNMAPPABLE;
|
|
else
|
|
return coded;
|
|
}
|
|
|
|
static DBCHAR
|
|
jisx0213_2004_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length)
|
|
{
|
|
DBCHAR coded;
|
|
Py_ssize_t ilength = *length;
|
|
|
|
coded = jisx0213_encoder(data, length, NULL);
|
|
switch (ilength) {
|
|
case 1:
|
|
if (coded == MAP_MULTIPLE_AVAIL)
|
|
return MAP_MULTIPLE_AVAIL;
|
|
else
|
|
return MAP_UNMAPPABLE;
|
|
case 2:
|
|
if (*length != 2)
|
|
return MAP_UNMAPPABLE;
|
|
else
|
|
return coded;
|
|
default:
|
|
return MAP_UNMAPPABLE;
|
|
}
|
|
}
|
|
|
|
static DBCHAR
|
|
jisx0213_2004_2_encoder(const ucs4_t *data, Py_ssize_t *length)
|
|
{
|
|
DBCHAR coded = jisx0213_encoder(data, length, NULL);
|
|
if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
|
|
return coded;
|
|
else if (coded & 0x8000)
|
|
return coded & 0x7fff;
|
|
else
|
|
return MAP_UNMAPPABLE;
|
|
}
|
|
|
|
static ucs4_t
|
|
jisx0201_r_decoder(const unsigned char *data)
|
|
{
|
|
ucs4_t u;
|
|
JISX0201_R_DECODE(*data, u)
|
|
else return MAP_UNMAPPABLE;
|
|
return u;
|
|
}
|
|
|
|
static DBCHAR
|
|
jisx0201_r_encoder(const ucs4_t *data, Py_ssize_t *length)
|
|
{
|
|
DBCHAR coded;
|
|
JISX0201_R_ENCODE(*data, coded)
|
|
else return MAP_UNMAPPABLE;
|
|
return coded;
|
|
}
|
|
|
|
static ucs4_t
|
|
jisx0201_k_decoder(const unsigned char *data)
|
|
{
|
|
ucs4_t u;
|
|
JISX0201_K_DECODE(*data ^ 0x80, u)
|
|
else return MAP_UNMAPPABLE;
|
|
return u;
|
|
}
|
|
|
|
static DBCHAR
|
|
jisx0201_k_encoder(const ucs4_t *data, Py_ssize_t *length)
|
|
{
|
|
DBCHAR coded;
|
|
JISX0201_K_ENCODE(*data, coded)
|
|
else return MAP_UNMAPPABLE;
|
|
return coded - 0x80;
|
|
}
|
|
|
|
static int
|
|
gb2312_init(void)
|
|
{
|
|
static int initialized = 0;
|
|
|
|
if (!initialized && (
|
|
IMPORT_MAP(cn, gbcommon, &gbcommon_encmap, NULL) ||
|
|
IMPORT_MAP(cn, gb2312, NULL, &gb2312_decmap)))
|
|
return -1;
|
|
initialized = 1;
|
|
return 0;
|
|
}
|
|
|
|
static ucs4_t
|
|
gb2312_decoder(const unsigned char *data)
|
|
{
|
|
ucs4_t u;
|
|
TRYMAP_DEC(gb2312, u, data[0], data[1])
|
|
return u;
|
|
else
|
|
return MAP_UNMAPPABLE;
|
|
}
|
|
|
|
static DBCHAR
|
|
gb2312_encoder(const ucs4_t *data, Py_ssize_t *length)
|
|
{
|
|
DBCHAR coded;
|
|
assert(*length == 1);
|
|
if (*data < 0x10000) {
|
|
TRYMAP_ENC(gbcommon, coded, *data) {
|
|
if (!(coded & 0x8000))
|
|
return coded;
|
|
}
|
|
}
|
|
return MAP_UNMAPPABLE;
|
|
}
|
|
|
|
|
|
static ucs4_t
|
|
dummy_decoder(const unsigned char *data)
|
|
{
|
|
return MAP_UNMAPPABLE;
|
|
}
|
|
|
|
static DBCHAR
|
|
dummy_encoder(const ucs4_t *data, Py_ssize_t *length)
|
|
{
|
|
return MAP_UNMAPPABLE;
|
|
}
|
|
|
|
/*-*- registry tables -*-*/
|
|
|
|
/*
 * Initializers for struct iso2022_designation, in field order:
 *     { mark, plane, width, initializer, decoder, encoder }
 */
#define REGISTRY_KSX1001_G0 \
    { CHARSET_KSX1001, 0, 2, \
      ksx1001_init, \
      ksx1001_decoder, ksx1001_encoder }
#define REGISTRY_KSX1001_G1 \
    { CHARSET_KSX1001, 1, 2, \
      ksx1001_init, \
      ksx1001_decoder, ksx1001_encoder }
#define REGISTRY_JISX0201_R \
    { CHARSET_JISX0201_R, 0, 1, \
      NULL, \
      jisx0201_r_decoder, jisx0201_r_encoder }
#define REGISTRY_JISX0201_K \
    { CHARSET_JISX0201_K, 0, 1, \
      NULL, \
      jisx0201_k_decoder, jisx0201_k_encoder }
#define REGISTRY_JISX0208 \
    { CHARSET_JISX0208, 0, 2, \
      jisx0208_init, \
      jisx0208_decoder, jisx0208_encoder }
#define REGISTRY_JISX0208_O \
    { CHARSET_JISX0208_O, 0, 2, \
      jisx0208_init, \
      jisx0208_decoder, jisx0208_encoder }
#define REGISTRY_JISX0212 \
    { CHARSET_JISX0212, 0, 2, \
      jisx0212_init, \
      jisx0212_decoder, jisx0212_encoder }
#define REGISTRY_JISX0213_2000_1 \
    { CHARSET_JISX0213_2000_1, 0, 2, \
      jisx0213_init, \
      jisx0213_2000_1_decoder, \
      jisx0213_2000_1_encoder }
#define REGISTRY_JISX0213_2000_1_PAIRONLY \
    { CHARSET_JISX0213_2000_1, 0, 2, \
      jisx0213_init, \
      jisx0213_2000_1_decoder, \
      jisx0213_2000_1_encoder_paironly }
#define REGISTRY_JISX0213_2000_2 \
    { CHARSET_JISX0213_2, 0, 2, \
      jisx0213_init, \
      jisx0213_2000_2_decoder, \
      jisx0213_2000_2_encoder }
#define REGISTRY_JISX0213_2004_1 \
    { CHARSET_JISX0213_2004_1, 0, 2, \
      jisx0213_init, \
      jisx0213_2004_1_decoder, \
      jisx0213_2004_1_encoder }
#define REGISTRY_JISX0213_2004_1_PAIRONLY \
    { CHARSET_JISX0213_2004_1, 0, 2, \
      jisx0213_init, \
      jisx0213_2004_1_decoder, \
      jisx0213_2004_1_encoder_paironly }
#define REGISTRY_JISX0213_2004_2 \
    { CHARSET_JISX0213_2, 0, 2, \
      jisx0213_init, \
      jisx0213_2004_2_decoder, \
      jisx0213_2004_2_encoder }
#define REGISTRY_GB2312 \
    { CHARSET_GB2312, 0, 2, \
      gb2312_init, \
      gb2312_decoder, gb2312_encoder }
/* The two CNS 11643 entries name cns11643_* hooks; they only compile if
 * some table expands them and those hooks exist. */
#define REGISTRY_CNS11643_1 \
    { CHARSET_CNS11643_1, 1, 2, \
      cns11643_init, \
      cns11643_1_decoder, cns11643_1_encoder }
#define REGISTRY_CNS11643_2 \
    { CHARSET_CNS11643_2, 2, 2, \
      cns11643_init, \
      cns11643_2_decoder, cns11643_2_encoder }
/* G2-only single-byte sets: decoding of these is done by
 * iso2022processg2, so the hooks are placeholders. */
#define REGISTRY_ISO8859_1 \
    { CHARSET_ISO8859_1, 2, 1, \
      NULL, dummy_decoder, dummy_encoder }
#define REGISTRY_ISO8859_7 \
    { CHARSET_ISO8859_7, 2, 1, \
      NULL, dummy_decoder, dummy_encoder }
/* mark == 0 terminates a designation table */
#define REGISTRY_SENTINEL { 0, }
/* Define iso2022_<var>_config from a flag set and the table named
 * iso2022_<var>_designations. */
#define CONFIGDEF(var, attrs) \
    static const struct iso2022_config iso2022_##var##_config = { \
        attrs, iso2022_##var##_designations \
    };
|
|
|
|
static const struct iso2022_designation iso2022_kr_designations[] = {
|
|
REGISTRY_KSX1001_G1, REGISTRY_SENTINEL
|
|
};
|
|
CONFIGDEF(kr, 0)
|
|
|
|
static const struct iso2022_designation iso2022_jp_designations[] = {
|
|
REGISTRY_JISX0208, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
|
|
REGISTRY_SENTINEL
|
|
};
|
|
CONFIGDEF(jp, NO_SHIFT | USE_JISX0208_EXT)
|
|
|
|
static const struct iso2022_designation iso2022_jp_1_designations[] = {
|
|
REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
|
|
REGISTRY_JISX0208_O, REGISTRY_SENTINEL
|
|
};
|
|
CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT)
|
|
|
|
static const struct iso2022_designation iso2022_jp_2_designations[] = {
|
|
REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0,
|
|
REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
|
|
REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL
|
|
};
|
|
CONFIGDEF(jp_2, NO_SHIFT | USE_G2 | USE_JISX0208_EXT)
|
|
|
|
static const struct iso2022_designation iso2022_jp_2004_designations[] = {
|
|
REGISTRY_JISX0213_2004_1_PAIRONLY, REGISTRY_JISX0208,
|
|
REGISTRY_JISX0213_2004_1, REGISTRY_JISX0213_2004_2, REGISTRY_SENTINEL
|
|
};
|
|
CONFIGDEF(jp_2004, NO_SHIFT | USE_JISX0208_EXT)
|
|
|
|
static const struct iso2022_designation iso2022_jp_3_designations[] = {
|
|
REGISTRY_JISX0213_2000_1_PAIRONLY, REGISTRY_JISX0208,
|
|
REGISTRY_JISX0213_2000_1, REGISTRY_JISX0213_2000_2, REGISTRY_SENTINEL
|
|
};
|
|
CONFIGDEF(jp_3, NO_SHIFT | USE_JISX0208_EXT)
|
|
|
|
static const struct iso2022_designation iso2022_jp_ext_designations[] = {
|
|
REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
|
|
REGISTRY_JISX0201_K, REGISTRY_JISX0208_O, REGISTRY_SENTINEL
|
|
};
|
|
CONFIGDEF(jp_ext, NO_SHIFT | USE_JISX0208_EXT)
|
|
|
|
|
|
BEGIN_MAPPINGS_LIST
|
|
/* no mapping table here */
|
|
END_MAPPINGS_LIST
|
|
|
|
#define ISO2022_CODEC(variation) { \
|
|
"iso2022_" #variation, \
|
|
&iso2022_##variation##_config, \
|
|
iso2022_codec_init, \
|
|
_STATEFUL_METHODS(iso2022) \
|
|
},
|
|
|
|
BEGIN_CODECS_LIST
|
|
ISO2022_CODEC(kr)
|
|
ISO2022_CODEC(jp)
|
|
ISO2022_CODEC(jp_1)
|
|
ISO2022_CODEC(jp_2)
|
|
ISO2022_CODEC(jp_2004)
|
|
ISO2022_CODEC(jp_3)
|
|
ISO2022_CODEC(jp_ext)
|
|
END_CODECS_LIST
|
|
|
|
I_AM_A_MODULE_FOR(iso2022)
|