// -*- coding: utf-8 -*- // Utility functions for strings. // // Copyright (C) 2007 Satoru Takabayashi // All rights reserved. This is free software with ABSOLUTELY NO WARRANTY. // You can redistribute it and/or modify it under the terms of // the GNU General Public License version 2. // NOTES: // // Surrogate pairs: // // 1st 0xD800 - 0xDBFF (high surrogate) // 2nd 0xDC00 - 0xDFFF (low surrogate) // // UTF-8 sequences: // // 0xxxxxxx // 110xxxxx 10xxxxxx // 1110xxxx 10xxxxxx 10xxxxxx // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx var EQUAL_SIGN = 0x3D; var QUESTION_MARK = 0x3F; // "あい" => [ 0x3042, 0x3044 ] function convertStringToUnicodeCodePoints(str) { var surrogate_1st = 0; var unicode_codes = []; for (var i = 0; i < str.length; ++i) { var utf16_code = str.charCodeAt(i); if (surrogate_1st != 0) { if (utf16_code >= 0xDC00 && utf16_code <= 0xDFFF) { var surrogate_2nd = utf16_code; var unicode_code = (surrogate_1st - 0xD800) * (1 << 10) + (1 << 16) + (surrogate_2nd - 0xDC00); unicode_codes.push(unicode_code); } else { // Malformed surrogate pair ignored. } surrogate_1st = 0; } else if (utf16_code >= 0xD800 && utf16_code <= 0xDBFF) { surrogate_1st = utf16_code; } else { unicode_codes.push(utf16_code); } } return unicode_codes; } // [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] => [ 0x3042, 0x3044 ] function convertUtf8BytesToUnicodeCodePoints(utf8_bytes) { var unicode_codes = []; var unicode_code = 0; var num_followed = 0; for (var i = 0; i < utf8_bytes.length; ++i) { var utf8_byte = utf8_bytes[i]; if (utf8_byte >= 0x100) { // Malformed utf8 byte ignored. } else if ((utf8_byte & 0xC0) == 0x80) { if (num_followed > 0) { unicode_code = (unicode_code << 6) | (utf8_byte & 0x3f); num_followed -= 1; } else { // Malformed UTF-8 sequence ignored. } } else { if (num_followed == 0) { unicode_codes.push(unicode_code); } else { // Malformed UTF-8 sequence ignored. } if (utf8_byte < 0x80){ // 1-byte unicode_code = utf8_byte; num_followed = 0; } else if ((utf8_byte & 0xE0) == 0xC0) { // 2-byte unicode_code = utf8_byte & 0x1f; num_followed = 1; } else if ((utf8_byte & 0xF0) == 0xE0) { // 3-byte unicode_code = utf8_byte & 0x0f; num_followed = 2; } else if ((utf8_byte & 0xF8) == 0xF0) { // 4-byte unicode_code = utf8_byte & 0x07; num_followed = 3; } else { // Malformed UTF-8 sequence ignored. } } } if (num_followed == 0) { unicode_codes.push(unicode_code); } else { // Malformed UTF-8 sequence ignored. } unicode_codes.shift(); // Trim the first element. return unicode_codes; } // Helper function. function convertEscapedCodesToCodes(str, prefix, base, num_bits) { var parts = str.split(prefix); parts.shift(); // Trim the first element. var codes = []; var max = Math.pow(2, num_bits); for (var i = 0; i < parts.length; ++i) { var code = parseInt(parts[i], base); if (code >= 0 && code < max) { codes.push(code); } else { // Malformed code ignored. } } return codes; } // r'\u3042\u3044' => [ 0x3042, 0x3044 ] // Note that the r '...' notation is borrowed from Python. function convertEscapedUtf16CodesToUtf16Codes(str) { return convertEscapedCodesToCodes(str, "\\u", 16, 16); } // r'\U00003042\U00003044' => [ 0x3042, 0x3044 ] function convertEscapedUtf32CodesToUnicodeCodePoints(str) { return convertEscapedCodesToCodes(str, "\\U", 16, 32); } // r'\xE3\x81\x82\xE3\x81\x84' => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] // r'\343\201\202\343\201\204' => [ 0343, 0201, 0202, 0343, 0201, 0204 ] function convertEscapedBytesToBytes(str, base) { var prefix = (base == 16 ? "\\x" : "\\"); return convertEscapedCodesToCodes(str, prefix, base, 8); } // "&#12354;&#12356;" => [ 0x3042, 0x3044 ] // "&#x3042;&#x3044;" => [ 0x3042, 0x3044 ] function convertNumRefToUnicodeCodePoints(str, base) { var num_refs = str.split(";"); num_refs.pop(); // Trim the last element. var unicode_codes = []; for (var i = 0; i < num_refs.length; ++i) { var decimal_str = num_refs[i].replace(/^&#x?/, ""); var unicode_code = parseInt(decimal_str, base); unicode_codes.push(unicode_code); } return unicode_codes; } // [ 0x3042, 0x3044 ] => [ 0x3042, 0x3044 ] // [ 0xD840, 0xDC0B ] => [ 0x2000B ] // A surrogate pair. function convertUnicodeCodePointsToUtf16Codes(unicode_codes) { var utf16_codes = []; for (var i = 0; i < unicode_codes.length; ++i) { var unicode_code = unicode_codes[i]; if (unicode_code < (1 << 16)) { utf16_codes.push(unicode_code); } else { var first = ((unicode_code - (1 << 16)) / (1 << 10)) + 0xD800; var second = (unicode_code % (1 << 10)) + 0xDC00; utf16_codes.push(first) utf16_codes.push(second) } } return utf16_codes; } // 0x3042 => [ 0xE3, 0x81, 0x82 ] function convertUnicodeCodePointToUtf8Bytes(unicode_code, base) { var utf8_bytes = []; if (unicode_code < 0x80) { // 1-byte utf8_bytes.push(unicode_code); } else if (unicode_code < (1 << 11)) { // 2-byte utf8_bytes.push((unicode_code >>> 6) | 0xC0); utf8_bytes.push((unicode_code & 0x3F) | 0x80); } else if (unicode_code < (1 << 16)) { // 3-byte utf8_bytes.push((unicode_code >>> 12) | 0xE0); utf8_bytes.push(((unicode_code >> 6) & 0x3f) | 0x80); utf8_bytes.push((unicode_code & 0x3F) | 0x80); } else if (unicode_code < (1 << 21)) { // 4-byte utf8_bytes.push((unicode_code >>> 18) | 0xF0); utf8_bytes.push(((unicode_code >> 12) & 0x3F) | 0x80); utf8_bytes.push(((unicode_code >> 6) & 0x3F) | 0x80); utf8_bytes.push((unicode_code & 0x3F) | 0x80); } return utf8_bytes; } // [ 0x3042, 0x3044 ] => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] function convertUnicodeCodePointsToUtf8Bytes(unicode_codes) { var utf8_bytes = []; for (var i = 0; i < unicode_codes.length; ++i) { var bytes = convertUnicodeCodePointToUtf8Bytes(unicode_codes[i]); utf8_bytes = utf8_bytes.concat(bytes); } return utf8_bytes; } // 0xff => "ff" // 0xff => "377" function formatNumber(number, base, num_digits) { var str = number.toString(base).toUpperCase(); for (var i = str.length; i < num_digits; ++i) { str = "0" + str; } return str; } var BASE64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; function encodeBase64Helper(data) { var encoded = []; if (data.length == 1) { encoded.push(BASE64.charAt(data[0] >> 2)); encoded.push(BASE64.charAt(((data[0] & 3) << 4))); encoded.push('='); encoded.push('='); } else if (data.length == 2) { encoded.push(BASE64.charAt(data[0] >> 2)); encoded.push(BASE64.charAt(((data[0] & 3) << 4) | (data[1] >> 4))); encoded.push(BASE64.charAt(((data[1] & 0xF) << 2))); encoded.push('='); } else if (data.length == 3) { encoded.push(BASE64.charAt(data[0] >> 2)); encoded.push(BASE64.charAt(((data[0] & 3) << 4) | (data[1] >> 4))); encoded.push(BASE64.charAt(((data[1] & 0xF) << 2) | (data[2] >> 6))); encoded.push(BASE64.charAt(data[2] & 0x3f)); } return encoded.join(''); } // "44GC44GE" => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] function decodeBase64(encoded) { var decoded_bytes = []; var data_bytes = []; for (var i = 0; i < encoded.length; i += 4) { data_bytes.length = 0; for (var j = i; j < i + 4; ++j) { var letter = encoded.charAt(j); if (letter == "=" || letter == "") { break; } var data_byte = BASE64.indexOf(letter); if (data_byte >= 64) { // Malformed base64 data. break; } data_bytes.push(data_byte); } if (data_bytes.length == 1) { // Malformed base64 data. } else if (data_bytes.length == 2) { // 12-bit. decoded_bytes.push((data_bytes[0] << 2) | (data_bytes[1] >> 4)); } else if (data_bytes.length == 3) { // 18-bit. decoded_bytes.push((data_bytes[0] << 2) | (data_bytes[1] >> 4)); decoded_bytes.push(((data_bytes[1] & 0xF) << 4) | (data_bytes[2] >> 2)); } else if (data_bytes.length == 4) { // 24-bit. decoded_bytes.push((data_bytes[0] << 2) | (data_bytes[1] >> 4)); decoded_bytes.push(((data_bytes[1] & 0xF) << 4) | (data_bytes[2] >> 2)); decoded_bytes.push(((data_bytes[2] & 0x3) << 6) | (data_bytes[3])); } } return decoded_bytes; } // [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] => "44GC44GE" function encodeBase64(data_bytes) { var encoded = ''; for (var i = 0; i < data_bytes.length; i += 3) { var at_most_three_bytes = data_bytes.slice(i, i + 3); encoded += encodeBase64Helper(at_most_three_bytes); } return encoded; } function decodeQuotedPrintableHelper(str, prefix) { var decoded_bytes = []; for (var i = 0; i < str.length;) { if (str.charAt(i) == prefix) { decoded_bytes.push(parseInt(str.substr(i + 1, 2), 16)); i += 3; } else { decoded_bytes.push(str.charCodeAt(i)); ++i; } } return decoded_bytes; } // "=E3=81=82=E3=81=84" => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] function decodeQuotedPrintable(str) { str = str.replace(/_/g, " ") // RFC 2047. return decodeQuotedPrintableHelper(str, "="); } // "%E3%81%82%E3%81%84" => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] function decodeUrl(str) { return decodeQuotedPrintableHelper(str, "%"); } function encodeQuotedPrintableHelper(data_bytes, prefix, should_escape) { var encoded = ''; var prefix_code = prefix.charCodeAt(0); for (var i = 0; i < data_bytes.length; ++i) { var data_byte = data_bytes[i]; if (should_escape(data_byte)) { encoded += prefix + formatNumber(data_bytes[i], 16, 2); } else { encoded += String.fromCharCode(data_byte); } } return encoded; } // [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] => "=E3=81=82=E3=81=84" function encodeQuotedPrintable(data_bytes) { var should_escape = function(b) { return b < 32 || b > 126 || b == EQUAL_SIGN || b == QUESTION_MARK; }; return encodeQuotedPrintableHelper(data_bytes, '=', should_escape); } var URL_SAFE = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_.-"; // [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] => "%E3%81%82%E3%81%84" function encodeUrl(data_bytes) { var should_escape = function(b) { return URL_SAFE.indexOf(String.fromCharCode(b)) == -1; }; return encodeQuotedPrintableHelper(data_bytes, '%', should_escape); } // [ 0x3042, 0x3044 ] => "あい" function convertUtf16CodesToString(utf16_codes) { var unescaped = ''; for (var i = 0; i < utf16_codes.length; ++i) { unescaped += String.fromCharCode(utf16_codes[i]); } return unescaped; } // [ 0x3042, 0x3044 ] => "あい" function convertUnicodeCodePointsToString(unicode_codes) { var utf16_codes = convertUnicodeCodePointsToUtf16Codes(unicode_codes); return convertUtf16CodesToString(utf16_codes); } function maybeInitMaps(encoded_maps, to_unicode_map, from_unicode_map) { if (to_unicode_map.is_initialized) { return; } var data_types = [ 'ROUNDTRIP', 'INPUT_ONLY', 'OUTPUT_ONLY' ]; for (var i = 0; i < data_types.length; ++i) { var data_type = data_types[i]; var encoded_data = encoded_maps[data_type]; var data_bytes = decodeBase64(encoded_data); for (var j = 0; j < data_bytes.length; j += 4) { var local_code = (data_bytes[j] << 8) | data_bytes[j + 1]; var unicode_code = (data_bytes[j + 2] << 8) | data_bytes[j + 3]; if (i == 0 || i == 1) { // ROUNDTRIP or INPUT_ONLY to_unicode_map[local_code] = unicode_code; } if (i == 0 || i == 2) { // ROUNDTRIP or OUTPUT_ONLY from_unicode_map[unicode_code] = local_code; } } } to_unicode_map.is_initialized = true; } var SJIS_TO_UNICODE = {} var UNICODE_TO_SJIS = {} // Requires: sjis_map.js should be loaded. function maybeInitSjisMaps() { maybeInitMaps(SJIS_MAP_ENCODED, SJIS_TO_UNICODE, UNICODE_TO_SJIS); } var ISO88591_TO_UNICODE = {} var UNICODE_TO_ISO88591 = {} // Requires: iso88591_map.js should be loaded. function maybeInitIso88591Maps() { maybeInitMaps(ISO88591_MAP_ENCODED, ISO88591_TO_UNICODE, UNICODE_TO_ISO88591); } function lookupMapWithDefault(map, key, default_value) { var value = map[key]; if (!value) { value = default_value; } return value; } // [ 0x3042, 0x3044 ] => [ 0x82, 0xA0, 0x82, 0xA2 ] function convertUnicodeCodePointsToSjisBytes(unicode_codes) { maybeInitSjisMaps(); var sjis_bytes = []; for (var i = 0; i < unicode_codes.length; ++i) { var unicode_code = unicode_codes[i]; var sjis_code = lookupMapWithDefault(UNICODE_TO_SJIS, unicode_code, QUESTION_MARK); if (sjis_code <= 0xFF) { // 1-byte character. sjis_bytes.push(sjis_code); } else { sjis_bytes.push(sjis_code >> 8); sjis_bytes.push(sjis_code & 0xFF); } } return sjis_bytes; } // [ 0x3042, 0x3044 ] => [ 0xA4, 0xA2, 0xA4, 0xA4 ] function convertUnicodeCodePointsToEucJpBytes(unicode_codes) { maybeInitSjisMaps(); var eucjp_bytes = []; for (var i = 0; i < unicode_codes.length; ++i) { var unicode_code = unicode_codes[i]; var sjis_code = lookupMapWithDefault(UNICODE_TO_SJIS, unicode_code, QUESTION_MARK); if (sjis_code > 0xFF) { // Double byte character. var jis_code = convertSjisCodeToJisX208Code(sjis_code); var eucjp_code = jis_code | 0x8080; eucjp_bytes.push(eucjp_code >> 8); eucjp_bytes.push(eucjp_code & 0xFF); } else if (sjis_code >= 0x80) { // 8-bit character. eucjp_bytes.push(0x8E); eucjp_bytes.push(sjis_code); } else { // 7-bit character. eucjp_bytes.push(sjis_code); } } return eucjp_bytes; } function convertUnicodeCodePointsToIso88591Bytes(unicode_codes) { maybeInitIso88591Maps(); var latin_bytes = []; for (var i = 0; i < unicode_codes.length; ++i) { var unicode_code = unicode_codes[i]; var latin_code = lookupMapWithDefault(UNICODE_TO_ISO88591, unicode_code, QUESTION_MARK); latin_bytes.push(latin_code); } return latin_bytes; } // [ 0x82, 0xA0, 0x82, 0xA2 ] => [ 0x3042, 0x3044 ] function convertSjisBytesToUnicodeCodePoints(sjis_bytes) { maybeInitSjisMaps(); var unicode_codes = []; for (var i = 0; i < sjis_bytes.length;) { var sjis_code = -1; var sjis_byte = sjis_bytes[i]; if ((sjis_byte >= 0x81 && sjis_byte <= 0x9F) || (sjis_byte >= 0xE0 && sjis_byte <= 0xFC)) { ++i; var sjis_byte2 = sjis_bytes[i]; if ((sjis_byte2 >= 0x40 && sjis_byte2 <= 0x7E) || (sjis_byte2 >= 0x80 && sjis_byte2 <= 0xFC)) { sjis_code = (sjis_byte << 8) | sjis_byte2; ++i; } } else { sjis_code = sjis_byte; ++i; } var unicode_code = lookupMapWithDefault(SJIS_TO_UNICODE, sjis_code, QUESTION_MARK); unicode_codes.push(unicode_code); } return unicode_codes; } function convertIso88591BytesToUnicodeCodePoints(latin_bytes) { maybeInitIso88591Maps(); var unicode_codes = []; for (var i = 0; i < latin_bytes.length; ++i) { var latin_code = latin_bytes[i]; var unicode_code = lookupMapWithDefault(ISO88591_TO_UNICODE, latin_code, QUESTION_MARK); unicode_codes.push(unicode_code); } return unicode_codes; } // 0x2422 => 0x82a0 function convertJisX208CodeToSjisCode(jis_code) { var j1 = jis_code >> 8; var j2 = jis_code & 0xFF; // http://people.debian.org/~kubota/unicode-symbols-map2.html.ja var s1 = ((j1 - 1) >> 1) + ((j1 <= 0x5E) ? 0x71 : 0xB1); var s2 = j2 + ((j1 & 1) ? ((j2 < 0x60) ? 0x1F : 0x20) : 0x7E); return (s1 << 8) | s2; } // 0x82a0 => 0x2422 function convertSjisCodeToJisX208Code(sjis_code) { var s1 = sjis_code >> 8; var s2 = sjis_code & 0xFF; // http://people.debian.org/~kubota/unicode-symbols-map2.html.ja var j1 = (s1 << 1) - (s1 <= 0x9f ? 0xe0 : 0x160) - (s2 < 0x9f ? 1 : 0); var j2 = s2 - 0x1f - (s2 >= 0x7f ? 1 : 0) - (s2 >= 0x9f ? 0x5e : 0); return (j1 << 8) | j2; } // [ 0x24, 0x22, 0x24, 0x24 ] => [ 0x82, 0xA0, 0x82, 0xA2 ] function convertJisX208BytesToSjisBytes(jis_bytes) { var sjis_bytes = []; for (var i = 0; i < jis_bytes.length; i += 2) { var jis_code = (jis_bytes[i] << 8) | jis_bytes[i + 1]; var sjis_code = convertJisX208CodeToSjisCode(jis_code); sjis_bytes.push(sjis_code >> 8); sjis_bytes.push(sjis_code & 0xFF); } return sjis_bytes; } // [ 0x82, 0xA0, 0x82, 0xA2 ] => [ 0x24, 0x22, 0x24, 0x24 ] function convertSjisBytesToJisX208Bytes(sjis_bytes) { var jis_bytes = []; for (var i = 0; i < sjis_bytes.length; i += 2) { var sjis_code = (sjis_bytes[i] << 8) | sjis_bytes[i + 1]; var jis_code = convertSjisCodeToJisX208Code(sjis_code); jis_bytes.push(jis_code >> 8); jis_bytes.push(jis_code & 0xFF); } return jis_bytes; } // Constants used in convertJisBytesToUnicodeCodePoints(). var ASCII = 0; var JISX201 = 1; var JISX208 = 2; // Map used in convertIso2022JpBytesToUnicodeCodePoints(). var ESCAPE_SEQUENCE_TO_MODE = { "(B": ASCII, "(J": JISX201, "$B": JISX208, "$@": JISX208 }; // Map used in convertUnicodeCodePointsToIso2022JpBytes(). var MODE_TO_ESCAPE_SEQUENCE = {} MODE_TO_ESCAPE_SEQUENCE[ASCII] = "(B"; MODE_TO_ESCAPE_SEQUENCE[JISX201] = "(J"; MODE_TO_ESCAPE_SEQUENCE[JISX208] = "$B"; // [ 0x1B, 0x24, 0x42, 0x24, 0x22, 0x1B, 0x28, 0x42, ] => [ 0x3042 ] function convertIso2022JpBytesToUnicodeCodePoints(iso2022jp_bytes) { maybeInitSjisMaps(); var flush = function(mode, data_bytes, output) { var unicode_codes = []; if (mode == ASCII) { unicode_codes = data_bytes; } else if (mode == JISX201) { // Might have half-width Katakana? unicode_codes = convertSjisBytesToUnicodeCodePoints(data_bytes); } else if (mode == JISX208) { var sjis_bytes = convertJisX208BytesToSjisBytes(data_bytes); unicode_codes = convertSjisBytesToUnicodeCodePoints(sjis_bytes); } else { // Unknown mode } for (var i = 0; i < unicode_codes.length; ++i) { output.push(unicode_codes[i]); } data_bytes.length = 0; // Clear. } var unicode_codes = []; var mode = ASCII; var current_data_bytes = []; for (var i = 0; i < iso2022jp_bytes.length;) { if (iso2022jp_bytes[i] == 0x1B) { // Mode is changed. flush(mode, current_data_bytes, unicode_codes); ++i; var code = String.fromCharCode(iso2022jp_bytes[i], iso2022jp_bytes[i + 1]); mode = ESCAPE_SEQUENCE_TO_MODE[code]; if (!mode) { // Unknown mode. mode = ASCII; } i += 2; } else { current_data_bytes.push(iso2022jp_bytes[i]); ++i; } } flush(mode, current_data_bytes, unicode_codes); return unicode_codes; } // [ 0xA4, 0xA2, 0xA4, 0xA4 ] => [ 0x3042, 0x3044 ] function convertEucJpBytesToUnicodeCodePoints(eucjp_bytes) { maybeInitSjisMaps(); var unicode_codes = []; for (var i = 0; i < eucjp_bytes.length;) { if (eucjp_bytes[i] >= 0x80 && (i + 1) < eucjp_bytes.length && eucjp_bytes[i + 1] >= 0x80) { var eucjp_code = (eucjp_bytes[i] << 8) | eucjp_bytes[i + 1]; var jis_code = eucjp_code & 0x7F7F; var sjis_code = convertJisX208CodeToSjisCode(jis_code); var unicode_code = lookupMapWithDefault(SJIS_TO_UNICODE, sjis_code, QUESTION_MARK); unicode_codes.push(unicode_code); i += 2; } else { if (eucjp_bytes[i] < 0x80) { unicode_codes.push(eucjp_bytes[i]); } else { // Ignore singleton 8-bit byte. } ++i; } } return unicode_codes; } // [ 0x3042 ] => [ 0x1B, 0x24, 0x42, 0x24, 0x22, 0x1B, 0x28, 0x42, ] function convertUnicodeCodePointsToIso2022JpBytes(unicode_codes) { maybeInitSjisMaps(); var mode = ASCII; var maybeChangeMode = function(new_mode) { if (mode != new_mode) { mode = new_mode; var esc_as_string = MODE_TO_ESCAPE_SEQUENCE[mode]; var esc_as_code_points = convertStringToUnicodeCodePoints(esc_as_string); iso2022jp_bytes.push(0x1B); // ESC code. iso2022jp_bytes = iso2022jp_bytes.concat(esc_as_code_points); } } var iso2022jp_bytes = []; for (var i = 0; i < unicode_codes.length; ++i) { var unicode_code = unicode_codes[i]; var sjis_code = lookupMapWithDefault(UNICODE_TO_SJIS, unicode_code, QUESTION_MARK); if (sjis_code > 0xFF) { // Double byte character. var jis_code = convertSjisCodeToJisX208Code(sjis_code); maybeChangeMode(JISX208); iso2022jp_bytes.push(jis_code >> 8); iso2022jp_bytes.push(jis_code & 0xFF); } else if (sjis_code >= 0x80) { // 8-bit character. maybeChangeMode(JISX201); iso2022jp_bytes.push(sjis_code); } else { // 7-bit character. maybeChangeMode(ASCII); iso2022jp_bytes.push(sjis_code); } } maybeChangeMode(ASCII); return iso2022jp_bytes; } var MIME_FULL_MATCH = /^=\?([^?]+)\?([BQ])\?([^?]+)\?=$/; var MIME_PARTIAL_MATCH = /^=\?([^?]+)\?([BQ])\?([^?]+)\?=/; // "=?UTF-8?B?44GC?=" => true // "foobar" => false function isMimeEncodedString(str) { return str.match(MIME_FULL_MATCH) != null; } // "=?UTF-8?B?44GC?=" => ["UTF-8", [0xE3, 0x81, 0x82]] // "=?UTF-8?Q?=E3=81=82?=" => ["UTF-8", [0xE3, 0x81, 0x82]] // "INVALID" => [] function decodeMime(str) { var m = str.match(MIME_FULL_MATCH); if (m) { var char_encoding = m[1]; // We don't need the language information preceded by '*'. char_encoding = char_encoding.replace(/\*.*$/, "") var mime_encoding = m[2]; var mime_str = m[3]; var decoded_bytes; if (mime_encoding == "B") { decoded_bytes = decodeBase64(mime_str); } else if (mime_encoding == "Q") { decoded_bytes = decodeQuotedPrintable(mime_str); } if (char_encoding != "" && decoded_bytes) { return [char_encoding, decoded_bytes] } } return []; } var OUTPUT_CONVERTERS = { 'ISO2022JP': convertUnicodeCodePointsToIso2022JpBytes, 'ISO88591': convertUnicodeCodePointsToIso88591Bytes, 'SHIFTJIS': convertUnicodeCodePointsToSjisBytes, 'EUCJP': convertUnicodeCodePointsToEucJpBytes, 'UTF8': convertUnicodeCodePointsToUtf8Bytes } var INPUT_CONVERTERS = { 'ISO2022JP': convertIso2022JpBytesToUnicodeCodePoints, 'ISO88591': convertIso88591BytesToUnicodeCodePoints, 'SHIFTJIS': convertSjisBytesToUnicodeCodePoints, 'EUCJP': convertEucJpBytesToUnicodeCodePoints, 'UTF8': convertUtf8BytesToUnicodeCodePoints } function convertUnicodeCodePointsToBytes(unicode_codes, encoding) { var normalized_encoding = normalizeEncodingName(encoding); var convert_function = OUTPUT_CONVERTERS[normalized_encoding]; if (convert_function) { return convert_function(unicode_codes); } return []; } function convertBytesToUnicodeCodePoints(data_bytes, encoding) { var normalized_encoding = normalizeEncodingName(encoding); var convert_function = INPUT_CONVERTERS[normalized_encoding]; if (convert_function) { return convert_function(data_bytes); } return []; } // 'あい' => r'\u3042\u3044' function escapeToUtf16(str) { var escaped = '' for (var i = 0; i < str.length; ++i) { var hex = str.charCodeAt(i).toString(16).toUpperCase(); escaped += "\\u" + "0000".substr(hex.length) + hex; } return escaped; } // 'あい' => r'\U00003042\U00003044' function escapeToUtf32(str) { var escaped = '' var unicode_codes = convertStringToUnicodeCodePoints(str); for (var i = 0; i "あい" // "あい" => "あい" function escapeToNumRef(str, base) { var unicode_codes = convertStringToUnicodeCodePoints(str); var escaped = '' var prefix = base == 10 ? '' : 'x'; for (var i = 0; i < unicode_codes.length; ++i) { var code = unicode_codes[i].toString(base).toUpperCase(); var num_ref = "&#" + prefix + code + ";" escaped += num_ref; } return escaped; } // "あい" => "l8je" function escapeToPunyCode(str) { var unicode_codes = convertStringToPunyCodes(str); return convertUnicodeCodePointsToString(unicode_codes); } // [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] => '\xE3\x81\x82\xE3\x81\x84' // [ 0343, 0201, 0202, 0343, 0201, 0204 ] => '\343\201\202\343\201\204' function convertBytesToEscapedString(data_bytes, base) { var escaped = ''; for (var i = 0; i < data_bytes.length; ++i) { var prefix = (base == 16 ? "\\x" : "\\"); var num_digits = base == 16 ? 2 : 3; var escaped_byte = prefix + formatNumber(data_bytes[i], base, num_digits) escaped += escaped_byte; } return escaped; } // "あい" => [0x6C, 0x38, 0x6A, 0x65] // "l8je" // Requires: punycode.js should be loaded. function convertStringToPunyCodes(str) { var unicode_codes = convertStringToUnicodeCodePoints(str); var puny_codes = []; var result = ""; if (PunyCode.encode(unicode_codes, puny_codes)) { return puny_codes; } return unicode_codes; } // [ 0x6C, 0x38, 0x6A, 0x65 ] => "あい" // Requires: punycode.js should be loaded. function convertPunyCodesToString(puny_codes) { var unicode_codes = []; if (PunyCode.decode(puny_codes, unicode_codes)) { return convertUnicodeCodePointsToString(unicode_codes); } return convertUnicodeCodePointsToString(puny_codes); } // "あい" => r'\xE3\x81\x82\xE3\x81\x84' // UTF-8 // "あい" => r'\343\201\202\343\201\204' // UTF-8 function escapeToEscapedBytes(str, base, encoding) { var unicode_codes = convertStringToUnicodeCodePoints(str); var data_bytes = convertUnicodeCodePointsToBytes(unicode_codes, encoding); return convertBytesToEscapedString(data_bytes, base); } // "あい" => "44GC44GE" // UTF-8 function escapeToBase64(str, encoding) { var unicode_codes = convertStringToUnicodeCodePoints(str); var data_bytes = convertUnicodeCodePointsToBytes(unicode_codes, encoding); return encodeBase64(data_bytes); } // "あい" => "=E3=81=82=E3=81=84" // UTF-8 function escapeToQuotedPrintable(str, encoding) { var unicode_codes = convertStringToUnicodeCodePoints(str); var data_bytes = convertUnicodeCodePointsToBytes(unicode_codes, encoding); return encodeQuotedPrintable(data_bytes); } // "あい" => "%E3%81%82%E3%81%84" function escapeToUrl(str, encoding) { var unicode_codes = convertStringToUnicodeCodePoints(str); var data_bytes = convertUnicodeCodePointsToBytes(unicode_codes, encoding); return encodeUrl(data_bytes); } // "あい" => "=?UTF-8?B?44GC44GE?=" // "あい" => "=?UTF-8?Q?=E3=81=82=E3=81=84?=" function escapeToMime(str, mime_encoding, char_encoding) { var unicode_codes = convertStringToUnicodeCodePoints(str); var data_bytes = convertUnicodeCodePointsToBytes(unicode_codes, char_encoding); if (str == "") { return ""; } var escaped = "=?" + char_encoding + "?"; if (mime_encoding == 'base64') { escaped += "B?"; escaped += encodeBase64(data_bytes); } else { escaped += "Q?"; escaped += encodeQuotedPrintable(data_bytes); } escaped += '?='; return escaped; } // r'\u3042\u3044 => "あい" function unescapeFromUtf16(str) { var utf16_codes = convertEscapedUtf16CodesToUtf16Codes(str); return convertUtf16CodesToString(utf16_codes); } // r'\U00003042\U00003044 => "あい" function unescapeFromUtf32(str) { var unicode_codes = convertEscapedUtf32CodesToUnicodeCodePoints(str); var utf16_codes = convertUnicodeCodePointsToUtf16Codes(unicode_codes); return convertUtf16CodesToString(utf16_codes); } // r'\xE3\x81\x82\xE3\x81\x84' => "あい" // r'\343\201\202\343\201\204' => "あい" function unescapeFromEscapedBytes(str, base, encoding) { var data_bytes = convertEscapedBytesToBytes(str, base); var unicode_codes = convertBytesToUnicodeCodePoints(data_bytes, encoding); return convertUnicodeCodePointsToString(unicode_codes); } // "あい" => "あい" // "あい" => "あい" function unescapeFromNumRef(str, base) { var unicode_codes = convertNumRefToUnicodeCodePoints(str, base); return convertUnicodeCodePointsToString(unicode_codes); } // "l8je" => "あい" function unescapeFromPunyCode(str) { var unicode_codes = convertStringToUnicodeCodePoints(str); return convertPunyCodesToString(unicode_codes); } // "44GC44GE" => "あい" function unescapeFromBase64(str, encoding) { var decoded_bytes = decodeBase64(str); var unicode_codes = convertBytesToUnicodeCodePoints(decoded_bytes, encoding); return convertUnicodeCodePointsToString(unicode_codes); } // "=E3=81=82=E3=81=84" => "あい" function unescapeFromQuotedPrintable(str, encoding) { var decoded_bytes = decodeQuotedPrintable(str); var unicode_bytes = convertBytesToUnicodeCodePoints(decoded_bytes, encoding); return convertUnicodeCodePointsToString(unicode_bytes); } // "%E3%81%82%E3%81%84" => "あい" function unescapeFromUrl(str, encoding) { var decoded_bytes = decodeUrl(str); var unicode_bytes = convertBytesToUnicodeCodePoints(decoded_bytes, encoding); return convertUnicodeCodePointsToString(unicode_bytes); } // " " => true // " \n" => true function isEmptyOrSequenceOfWhiteSpaces(str) { for (var i = 0; i < str.length; ++i) { var code = str.charCodeAt(i); if (!(code == 0x09 || // TAB code == 0x0A || // LF code == 0x0D || // CR code == 0x20)) { // SPACE return false; } } return true; } // "=?UTF-8?B?*?= =?UTF-8?B?*?=" => ["=?UTF-8?B?*?=", "=?UTF-8?B?*?="] // "=?UTF-8?B?*?=FOO" => ["=?UTF-8?B?*?=", "FOO"] function splitMimeString(str) { var parts = []; var current = ""; while (str != "") { var m = str.match(MIME_PARTIAL_MATCH) if (m) { if (!isEmptyOrSequenceOfWhiteSpaces(current)) { parts.push(current); } current = ""; parts.push(m[0]); str = str.substr(m[0].length); } else { current += str.charAt(0); str = str.substr(1); } } if (!isEmptyOrSequenceOfWhiteSpaces(current)) { parts.push(current); } return parts; } // "UTF-8" => "UTF8" // "Shift_JIS" => "SHIFTJIS" function normalizeEncodingName(encoding) { return encoding.toUpperCase().replace(/[_-]/g, ""); } // "=?UTF-8?B?44GC44GE?=" => "あい" // "=?Shift_JIS?B?gqCCog==?=" => "あい" // "=?ISO-2022-JP?B?GyRCJCIkJBsoQg==?=" => "あい" // "=?UTF-8?Q?=E3=81=82=E3=81=84?=" => "あい" // "=?Shift_JIS?Q?=82=A0=82=A2?=" => "あい" // "=?ISO-2022-JP?Q?=1B$B$"$$=1B(B?=" => "あい" function unescapeFromMime(str) { var parts = splitMimeString(str); var unescaped = ""; for (var i = 0; i < parts.length; ++i) { if (isMimeEncodedString(parts[i])) { var pair = decodeMime(parts[i]); if (pair.length == 0) { // Malformed MIME string. Skip it. continue; } var encoding = normalizeEncodingName(pair[0]); var data_bytes = pair[1]; var unicode_codes = convertBytesToUnicodeCodePoints(data_bytes, encoding); unescaped += convertUnicodeCodePointsToString(unicode_codes); } else { unescaped += parts[i]; } } return unescaped; }