Edit File by line

#ifndef Py_UNICODEOBJECT_H

[0] Fix | Delete

#define Py_UNICODEOBJECT_H

[1] Fix | Delete

[2] Fix | Delete

#include <stdarg.h>

[3] Fix | Delete

[4] Fix | Delete

[5] Fix | Delete

[6] Fix | Delete

/* Unicode implementation based on original code by Fredrik Lundh,

[7] Fix | Delete

modified by Marc-Andre Lemburg (mal@lemburg.com) according to the

[8] Fix | Delete

Unicode Integration Proposal. (See

[9] Fix | Delete

http://www.egenix.com/files/python/unicode-proposal.txt).

[10] Fix | Delete

[11] Fix | Delete

[12] Fix | Delete

[13] Fix | Delete

[14] Fix | Delete

Original header:

[15] Fix | Delete

--------------------------------------------------------------------

[16] Fix | Delete

[17] Fix | Delete

* Yet another Unicode string type for Python. This type supports the

[18] Fix | Delete

* 16-bit Basic Multilingual Plane (BMP) only.

[19] Fix | Delete

[20] Fix | Delete

* Written by Fredrik Lundh, January 1999.

[21] Fix | Delete

[22] Fix | Delete

[23] Fix | Delete

[24] Fix | Delete

[25] Fix | Delete

* fredrik@pythonware.com

[26] Fix | Delete

* http://www.pythonware.com

[27] Fix | Delete

[28] Fix | Delete

* --------------------------------------------------------------------

[29] Fix | Delete

* This Unicode String Type is

[30] Fix | Delete

[31] Fix | Delete

[32] Fix | Delete

[33] Fix | Delete

[34] Fix | Delete

* By obtaining, using, and/or copying this software and/or its

[35] Fix | Delete

* associated documentation, you agree that you have read, understood,

[36] Fix | Delete

* and will comply with the following terms and conditions:

[37] Fix | Delete

[38] Fix | Delete

* Permission to use, copy, modify, and distribute this software and its

[39] Fix | Delete

* associated documentation for any purpose and without fee is hereby

[40] Fix | Delete

* granted, provided that the above copyright notice appears in all

[41] Fix | Delete

* copies, and that both that copyright notice and this permission notice

[42] Fix | Delete

* appear in supporting documentation, and that the name of Secret Labs

[43] Fix | Delete

* AB or the author not be used in advertising or publicity pertaining to

[44] Fix | Delete

* distribution of the software without specific, written prior

[45] Fix | Delete

* permission.

[46] Fix | Delete

[47] Fix | Delete

* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO

[48] Fix | Delete

* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND

[49] Fix | Delete

* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR

[50] Fix | Delete

* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES

[51] Fix | Delete

* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN

[52] Fix | Delete

* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT

[53] Fix | Delete

* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

[54] Fix | Delete

* -------------------------------------------------------------------- */

[55] Fix | Delete

[56] Fix | Delete

#include <ctype.h>

[57] Fix | Delete

[58] Fix | Delete

/* === Internal API ======================================================= */

[59] Fix | Delete

[60] Fix | Delete

/* --- Internal Unicode Format -------------------------------------------- */

[61] Fix | Delete

[62] Fix | Delete

/* Python 3.x requires unicode */

[63] Fix | Delete

#define Py_USING_UNICODE

[64] Fix | Delete

[65] Fix | Delete

#ifndef SIZEOF_WCHAR_T

[66] Fix | Delete

#error Must define SIZEOF_WCHAR_T

[67] Fix | Delete

#endif

[68] Fix | Delete

[69] Fix | Delete

#define Py_UNICODE_SIZE SIZEOF_WCHAR_T

[70] Fix | Delete

[71] Fix | Delete

/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.

[72] Fix | Delete

Otherwise, Unicode strings are stored as UCS-2 (with limited support

[73] Fix | Delete

for UTF-16) */

[74] Fix | Delete

[75] Fix | Delete

#if Py_UNICODE_SIZE >= 4

[76] Fix | Delete

#define Py_UNICODE_WIDE

[77] Fix | Delete

#endif

[78] Fix | Delete

[79] Fix | Delete

/* Set these flags if the platform has "wchar.h" and the

[80] Fix | Delete

wchar_t type is a 16-bit unsigned type */

[81] Fix | Delete

/* #define HAVE_WCHAR_H */

[82] Fix | Delete

/* #define HAVE_USABLE_WCHAR_T */

[83] Fix | Delete

[84] Fix | Delete

/* Py_UNICODE was the native Unicode storage format (code unit) used by

[85] Fix | Delete

Python and represents a single Unicode element in the Unicode type.

[86] Fix | Delete

With PEP 393, Py_UNICODE is deprecated and replaced with a

[87] Fix | Delete

typedef to wchar_t. */

[88] Fix | Delete

[89] Fix | Delete

#ifndef Py_LIMITED_API
/* Legacy code-unit type, hidden from the limited API.  Both the macro and
   the typedef resolve to wchar_t. */
#define PY_UNICODE_TYPE wchar_t
typedef wchar_t Py_UNICODE;
#endif /* !Py_LIMITED_API */

[93] Fix | Delete

[94] Fix | Delete

/* If the compiler provides a wchar_t type we try to support it

[95] Fix | Delete

through the interface functions PyUnicode_FromWideChar(),

[96] Fix | Delete

PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */

[97] Fix | Delete

[98] Fix | Delete

#ifdef HAVE_USABLE_WCHAR_T

[99] Fix | Delete

# ifndef HAVE_WCHAR_H

[100] Fix | Delete

# define HAVE_WCHAR_H

[101] Fix | Delete

# endif

[102] Fix | Delete

#endif

[103] Fix | Delete

[104] Fix | Delete

#ifdef HAVE_WCHAR_H

[105] Fix | Delete

/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */

[106] Fix | Delete

# ifdef _HAVE_BSDI

[107] Fix | Delete

# include <time.h>

[108] Fix | Delete

# endif

[109] Fix | Delete

# include <wchar.h>

[110] Fix | Delete

#endif

[111] Fix | Delete

[112] Fix | Delete

/* Py_UCS4 and Py_UCS2 are typedefs for the respective

[113] Fix | Delete

unicode representations. */

[114] Fix | Delete

typedef uint32_t Py_UCS4;   /* 32-bit unsigned unit */
typedef uint16_t Py_UCS2;   /* 16-bit unsigned unit */
typedef uint8_t Py_UCS1;    /*  8-bit unsigned unit */

[117] Fix | Delete

[118] Fix | Delete

/* --- Internal Unicode Operations ---------------------------------------- */

[119] Fix | Delete

[120] Fix | Delete

/* Since splitting on whitespace is an important use case, and

[121] Fix | Delete

whitespace in most situations is solely ASCII whitespace, we

[122] Fix | Delete

optimize for the common case by using a quick look-up table

[123] Fix | Delete

_Py_ascii_whitespace (see below) with an inlined check. */

[124] Fix | Delete

[125] Fix | Delete

[126] Fix | Delete

#ifndef Py_LIMITED_API

[127] Fix | Delete

/* Whitespace test: values below 128 are answered from the
   _Py_ascii_whitespace table, everything else is delegated to
   _PyUnicode_IsWhitespace().  `ch` is expanded more than once, so the
   argument must be free of side effects. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))

[129] Fix | Delete

[130] Fix | Delete

/* Case and line-break predicates.  Each macro forwards `ch` unchanged to
   the matching _PyUnicode_Is*() helper. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

[134] Fix | Delete

[135] Fix | Delete

/* Case conversions.  Each macro forwards `ch` unchanged to the matching
   _PyUnicode_To*case() helper. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

[138] Fix | Delete

[139] Fix | Delete

/* Numeric and printable predicates.  Each macro forwards `ch` unchanged to
   the matching _PyUnicode_Is*() helper. */
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

[143] Fix | Delete

[144] Fix | Delete

/* Numeric value lookups.  Each macro forwards `ch` unchanged to the
   matching _PyUnicode_To*() helper. */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

[147] Fix | Delete

[148] Fix | Delete

/* Alphabetic test; forwards `ch` unchanged to _PyUnicode_IsAlpha(). */
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

[149] Fix | Delete

[150] Fix | Delete

/* True when `ch` passes the alphabetic test or any of the decimal, digit
   or numeric tests.  The four tests are tried in that order and
   short-circuit.  `ch` is expanded once per test, so the argument must be
   free of side effects. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))

[155] Fix | Delete

[156] Fix | Delete

/* Copy `length` Py_UNICODE code units from `source` to `target` using
   memcpy(); the two regions must therefore not overlap.  The expansion is
   the memcpy() call itself. */
#define Py_UNICODE_COPY(target, source, length) \
    memcpy((target), (source), (length)*sizeof(Py_UNICODE))

[158] Fix | Delete

[159] Fix | Delete

/* Store `value` into the first `length` code units of `target`.

   Every argument is evaluated exactly once, in the order target, value,
   length; the count is captured in n_ so that an argument with side
   effects is not re-run on each loop iteration.  A length of zero or less
   writes nothing.  Wrapped in do/while(0) so the macro behaves as a single
   statement. */
#define Py_UNICODE_FILL(target, value, length) \
    do { \
        Py_UNICODE *t_ = (target); \
        Py_UNICODE v_ = (value); \
        Py_ssize_t n_ = (length); \
        Py_ssize_t i_; \
        for (i_ = 0; i_ < n_; i_++) { \
            t_[i_] = v_; \
        } \
    } while (0)

[163] Fix | Delete

[164] Fix | Delete

/* macros to work with surrogates */

[165] Fix | Delete

/* Any surrogate code unit, high or low: 0xD800..0xDFFF inclusive.
   `ch` may be evaluated twice. */
#define Py_UNICODE_IS_SURROGATE(ch) ((ch) >= 0xD800 && (ch) <= 0xDFFF)

[166] Fix | Delete

/* High (leading) surrogate: 0xD800..0xDBFF inclusive.
   `ch` may be evaluated twice. */
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) ((ch) >= 0xD800 && (ch) <= 0xDBFF)

[167] Fix | Delete

/* Low (trailing) surrogate: 0xDC00..0xDFFF inclusive.
   `ch` may be evaluated twice. */
#define Py_UNICODE_IS_LOW_SURROGATE(ch) ((ch) >= 0xDC00 && (ch) <= 0xDFFF)

[168] Fix | Delete

/* Join two surrogate characters and return a single Py_UCS4 value.
   Only the low 10 bits of each argument are used; neither argument is
   checked for actually being a surrogate. */
#define Py_UNICODE_JOIN_SURROGATES(high, low) \
    (((((Py_UCS4)(high) & 0x03FF) << 10) | \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

[172] Fix | Delete

/* high surrogate = top 10 bits added to D800 */

[173] Fix | Delete

/* No range check is performed: for ch below 0x10000 the result falls
   below 0xD800 and is not a surrogate. */
#define Py_UNICODE_HIGH_SURROGATE(ch) (((ch) >> 10) + (0xD800 - (0x10000 >> 10)))

[174] Fix | Delete

/* low surrogate = bottom 10 bits added to DC00 */

[175] Fix | Delete

/* Only the low 10 bits of ch contribute, so the result is always within
   0xDC00..0xDFFF. */
#define Py_UNICODE_LOW_SURROGATE(ch) (((ch) & 0x3FF) + 0xDC00)

[176] Fix | Delete

[177] Fix | Delete

/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   The first and the last code unit are compared before memcmp() is run
   over the whole substring.  Both operands are accessed through their
   wstr and wstr_length members, and each argument is expanded several
   times, so arguments must be free of side effects. */
#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
     ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
     !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))

[184] Fix | Delete

[185] Fix | Delete

#endif /* Py_LIMITED_API */

[186] Fix | Delete

[187] Fix | Delete

#ifdef __cplusplus

[188] Fix | Delete

extern "C" {

[189] Fix | Delete

#endif

[190] Fix | Delete

[191] Fix | Delete

/* --- Unicode Type ------------------------------------------------------- */

[192] Fix | Delete

[193] Fix | Delete

#ifndef Py_LIMITED_API

[194] Fix | Delete

[195] Fix | Delete

/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject

[196] Fix | Delete

structure. state.ascii and state.compact are set, and the data

[197] Fix | Delete

immediately follow the structure. utf8_length and wstr_length can be found

[198] Fix | Delete

in the length field; the utf8 pointer is equal to the data pointer. */

[199] Fix | Delete

typedef struct {

[200] Fix | Delete

/* There are 4 forms of Unicode strings:

[201] Fix | Delete

[202] Fix | Delete

- compact ascii:

[203] Fix | Delete

[204] Fix | Delete

* structure = PyASCIIObject

[205] Fix | Delete

* test: PyUnicode_IS_COMPACT_ASCII(op)

[206] Fix | Delete

* kind = PyUnicode_1BYTE_KIND

[207] Fix | Delete

* compact = 1

[208] Fix | Delete

* ascii = 1

[209] Fix | Delete

* ready = 1

[210] Fix | Delete

* (length is the length of the utf8 and wstr strings)

[211] Fix | Delete

* (data starts just after the structure)

[212] Fix | Delete

* (since ASCII is decoded from UTF-8, the utf8 string is the data)

[213] Fix | Delete

[214] Fix | Delete

- compact:

[215] Fix | Delete

[216] Fix | Delete

* structure = PyCompactUnicodeObject

[217] Fix | Delete

* test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)

[218] Fix | Delete

* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or

[219] Fix | Delete

PyUnicode_4BYTE_KIND

[220] Fix | Delete

* compact = 1

[221] Fix | Delete

* ready = 1

[222] Fix | Delete

* ascii = 0

[223] Fix | Delete

* utf8 is not shared with data

[224] Fix | Delete

* utf8_length = 0 if utf8 is NULL

[225] Fix | Delete

* wstr is shared with data and wstr_length=length

[226] Fix | Delete

if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2

[227] Fix | Delete

or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4

[228] Fix | Delete

* wstr_length = 0 if wstr is NULL

[229] Fix | Delete

* (data starts just after the structure)

[230] Fix | Delete

[231] Fix | Delete

- legacy string, not ready:

[232] Fix | Delete

[233] Fix | Delete

* structure = PyUnicodeObject

[234] Fix | Delete

* test: kind == PyUnicode_WCHAR_KIND

[235] Fix | Delete

* length = 0 (use wstr_length)

[236] Fix | Delete

* hash = -1

[237] Fix | Delete

* kind = PyUnicode_WCHAR_KIND

[238] Fix | Delete

* compact = 0

[239] Fix | Delete

* ascii = 0

[240] Fix | Delete

* ready = 0

[241] Fix | Delete

* interned = SSTATE_NOT_INTERNED

[242] Fix | Delete

* wstr is not NULL

[243] Fix | Delete

* data.any is NULL

[244] Fix | Delete

* utf8 is NULL

[245] Fix | Delete

* utf8_length = 0

[246] Fix | Delete

[247] Fix | Delete

- legacy string, ready:

[248] Fix | Delete

[249] Fix | Delete

* structure = PyUnicodeObject

[250] Fix | Delete

* test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND

[251] Fix | Delete

* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or

[252] Fix | Delete

PyUnicode_4BYTE_KIND

[253] Fix | Delete

* compact = 0

[254] Fix | Delete

* ready = 1

[255] Fix | Delete

* data.any is not NULL

[256] Fix | Delete

* utf8 is shared and utf8_length = length with data.any if ascii = 1

[257] Fix | Delete

* utf8_length = 0 if utf8 is NULL

[258] Fix | Delete

* wstr is shared with data.any and wstr_length = length

[259] Fix | Delete

if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2

[260] Fix | Delete

or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4

[261] Fix | Delete

* wstr_length = 0 if wstr is NULL

[262] Fix | Delete

[263] Fix | Delete

Compact strings use only one memory block (structure + characters),

[264] Fix | Delete

whereas legacy strings use one block for the structure and one block

[265] Fix | Delete

for characters.

[266] Fix | Delete

[267] Fix | Delete

Legacy strings are created by PyUnicode_FromUnicode() and

[268] Fix | Delete

PyUnicode_FromStringAndSize(NULL, size) functions. They become ready

[269] Fix | Delete

when PyUnicode_READY() is called.

[270] Fix | Delete

[271] Fix | Delete