gh-139353: Add Objects/unicode_writer.c file (#139911)
Move the public PyUnicodeWriter API and the private _PyUnicodeWriter API to a new Objects/unicode_writer.c file. Rename a few helper functions to share them between unicodeobject.c and unicode_writer.c, such as resize_compact() or unicode_result().
This commit is contained in:
@@ -17,6 +17,46 @@ extern "C" {
|
||||
|
||||
|
||||
extern int _PyUnicode_IsModifiable(PyObject *unicode);
|
||||
extern void _PyUnicodeWriter_InitWithBuffer(
|
||||
_PyUnicodeWriter *writer,
|
||||
PyObject *buffer);
|
||||
extern PyObject* _PyUnicode_Result(PyObject *unicode);
|
||||
extern int _PyUnicode_DecodeUTF8Writer(
|
||||
_PyUnicodeWriter *writer,
|
||||
const char *s,
|
||||
Py_ssize_t size,
|
||||
_Py_error_handler error_handler,
|
||||
const char *errors,
|
||||
Py_ssize_t *consumed);
|
||||
extern PyObject* _PyUnicode_ResizeCompact(
|
||||
PyObject *unicode,
|
||||
Py_ssize_t length);
|
||||
extern PyObject* _PyUnicode_GetEmpty(void);
|
||||
|
||||
|
||||
/* Generic helper macro that copies a run of characters while converting
   each one from `from_type` to `to_type`.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should be of type "from_type *".  to is of
   type "to_type *" and points to the buffer receiving the converted
   characters.  The main loop handles four characters per iteration; a
   second loop copies the remaining 0-3 characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        to_type *_dst = (to_type *)(to);                             \
        const from_type *_src = (const from_type *)(begin);          \
        const from_type *_stop = (const from_type *)(end);           \
        Py_ssize_t _count = (_stop) - (_src);                        \
        const from_type *_block_stop =                               \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);                   \
        for (; _src < (_block_stop); _src += 4, _dst += 4) {         \
            _dst[0] = (to_type) _src[0];                             \
            _dst[1] = (to_type) _src[1];                             \
            _dst[2] = (to_type) _src[2];                             \
            _dst[3] = (to_type) _src[3];                             \
        }                                                            \
        for (; _src < (_stop); ++_src, ++_dst) {                     \
            *_dst = (to_type) *_src;                                 \
        }                                                            \
    } while (0)
|
||||
|
||||
|
||||
static inline void
|
||||
|
||||
@@ -559,6 +559,7 @@ OBJECT_OBJS= \
|
||||
Objects/typevarobject.o \
|
||||
Objects/unicode_format.o \
|
||||
Objects/unicode_formatter.o \
|
||||
Objects/unicode_writer.o \
|
||||
Objects/unicodectype.o \
|
||||
Objects/unicodeobject.o \
|
||||
Objects/unionobject.o \
|
||||
|
||||
639
Objects/unicode_writer.c
Normal file
639
Objects/unicode_writer.c
Normal file
@@ -0,0 +1,639 @@
|
||||
/*
|
||||
|
||||
Unicode implementation based on original code by Fredrik Lundh,
|
||||
modified by Marc-Andre Lemburg <mal@lemburg.com>.
|
||||
|
||||
Major speed upgrades to the method implementations at the Reykjavik
|
||||
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
|
||||
|
||||
Copyright (c) Corporation for National Research Initiatives.
|
||||
|
||||
--------------------------------------------------------------------
|
||||
The original string type implementation is:
|
||||
|
||||
Copyright (c) 1999 by Secret Labs AB
|
||||
Copyright (c) 1999 by Fredrik Lundh
|
||||
|
||||
By obtaining, using, and/or copying this software and/or its
|
||||
associated documentation, you agree that you have read, understood,
|
||||
and will comply with the following terms and conditions:
|
||||
|
||||
Permission to use, copy, modify, and distribute this software and its
|
||||
associated documentation for any purpose and without fee is hereby
|
||||
granted, provided that the above copyright notice appears in all
|
||||
copies, and that both that copyright notice and this permission notice
|
||||
appear in supporting documentation, and that the name of Secret Labs
|
||||
AB or the author not be used in advertising or publicity pertaining to
|
||||
distribution of the software without specific, written prior
|
||||
permission.
|
||||
|
||||
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
|
||||
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
|
||||
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
|
||||
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
--------------------------------------------------------------------
|
||||
|
||||
*/
|
||||
|
||||
#include "Python.h"
|
||||
#include "pycore_freelist.h" // _Py_FREELIST_FREE()
|
||||
#include "pycore_long.h" // _PyLong_FormatWriter()
|
||||
#include "pycore_unicodeobject.h" // _PyUnicode_Result()
|
||||
|
||||
|
||||
#ifdef MS_WINDOWS
|
||||
/* On Windows, overallocating by 50% is the best factor */
|
||||
# define OVERALLOCATE_FACTOR 2
|
||||
#else
|
||||
/* On Linux, overallocating by 25% is the best factor */
|
||||
# define OVERALLOCATE_FACTOR 4
|
||||
#endif
|
||||
|
||||
|
||||
/* Compilation of templated routines */
|
||||
|
||||
#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()
|
||||
|
||||
#include "stringlib/ucs1lib.h"
|
||||
#include "stringlib/find_max_char.h"
|
||||
#include "stringlib/undef.h"
|
||||
|
||||
|
||||
/* Copy an ASCII or latin1 char* string into a Python Unicode string.
|
||||
|
||||
WARNING: The function doesn't copy the terminating null character and
|
||||
doesn't check the maximum character (may write a latin1 character in an
|
||||
ASCII string). */
|
||||
static void
|
||||
unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
|
||||
const char *str, Py_ssize_t len)
|
||||
{
|
||||
int kind = PyUnicode_KIND(unicode);
|
||||
const void *data = PyUnicode_DATA(unicode);
|
||||
const char *end = str + len;
|
||||
|
||||
assert(index + len <= PyUnicode_GET_LENGTH(unicode));
|
||||
switch (kind) {
|
||||
case PyUnicode_1BYTE_KIND: {
|
||||
#ifdef Py_DEBUG
|
||||
if (PyUnicode_IS_ASCII(unicode)) {
|
||||
Py_UCS4 maxchar = ucs1lib_find_max_char(
|
||||
(const Py_UCS1*)str,
|
||||
(const Py_UCS1*)str + len);
|
||||
assert(maxchar < 128);
|
||||
}
|
||||
#endif
|
||||
memcpy((char *) data + index, str, len);
|
||||
break;
|
||||
}
|
||||
case PyUnicode_2BYTE_KIND: {
|
||||
Py_UCS2 *start = (Py_UCS2 *)data + index;
|
||||
Py_UCS2 *ucs2 = start;
|
||||
|
||||
for (; str < end; ++ucs2, ++str)
|
||||
*ucs2 = (Py_UCS2)*str;
|
||||
|
||||
assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
|
||||
break;
|
||||
}
|
||||
case PyUnicode_4BYTE_KIND: {
|
||||
Py_UCS4 *start = (Py_UCS4 *)data + index;
|
||||
Py_UCS4 *ucs4 = start;
|
||||
|
||||
for (; str < end; ++ucs4, ++str)
|
||||
*ucs4 = (Py_UCS4)*str;
|
||||
|
||||
assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
|
||||
break;
|
||||
}
|
||||
default:
|
||||
Py_UNREACHABLE();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static inline void
|
||||
_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
|
||||
{
|
||||
writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
|
||||
writer->data = PyUnicode_DATA(writer->buffer);
|
||||
|
||||
if (!writer->readonly) {
|
||||
writer->kind = PyUnicode_KIND(writer->buffer);
|
||||
writer->size = PyUnicode_GET_LENGTH(writer->buffer);
|
||||
}
|
||||
else {
|
||||
/* use a value smaller than PyUnicode_1BYTE_KIND() so
|
||||
_PyUnicodeWriter_PrepareKind() will copy the buffer. */
|
||||
writer->kind = 0;
|
||||
assert(writer->kind <= PyUnicode_1BYTE_KIND);
|
||||
|
||||
/* Copy-on-write mode: set buffer size to 0 so
|
||||
* _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
|
||||
* next write. */
|
||||
writer->size = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
|
||||
{
|
||||
memset(writer, 0, sizeof(*writer));
|
||||
|
||||
/* ASCII is the bare minimum */
|
||||
writer->min_char = 127;
|
||||
|
||||
/* use a kind value smaller than PyUnicode_1BYTE_KIND so
|
||||
_PyUnicodeWriter_PrepareKind() will copy the buffer. */
|
||||
assert(writer->kind == 0);
|
||||
assert(writer->kind < PyUnicode_1BYTE_KIND);
|
||||
}
|
||||
|
||||
|
||||
PyUnicodeWriter*
|
||||
PyUnicodeWriter_Create(Py_ssize_t length)
|
||||
{
|
||||
if (length < 0) {
|
||||
PyErr_SetString(PyExc_ValueError,
|
||||
"length must be positive");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const size_t size = sizeof(_PyUnicodeWriter);
|
||||
PyUnicodeWriter *pub_writer;
|
||||
pub_writer = _Py_FREELIST_POP_MEM(unicode_writers);
|
||||
if (pub_writer == NULL) {
|
||||
pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size);
|
||||
if (pub_writer == NULL) {
|
||||
return (PyUnicodeWriter *)PyErr_NoMemory();
|
||||
}
|
||||
}
|
||||
_PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
|
||||
|
||||
_PyUnicodeWriter_Init(writer);
|
||||
if (_PyUnicodeWriter_Prepare(writer, length, 127) < 0) {
|
||||
PyUnicodeWriter_Discard(pub_writer);
|
||||
return NULL;
|
||||
}
|
||||
writer->overallocate = 1;
|
||||
|
||||
return pub_writer;
|
||||
}
|
||||
|
||||
|
||||
/* Release the writer's buffer and hand the writer memory back to the
   unicode_writers freelist (or PyMem_Free).  A NULL writer is ignored. */
void
PyUnicodeWriter_Discard(PyUnicodeWriter *writer)
{
    if (writer != NULL) {
        _PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer);
        _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
    }
}
|
||||
|
||||
|
||||
// Initialize _PyUnicodeWriter with initial buffer
|
||||
void
|
||||
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
|
||||
{
|
||||
memset(writer, 0, sizeof(*writer));
|
||||
writer->buffer = buffer;
|
||||
_PyUnicodeWriter_Update(writer);
|
||||
writer->min_length = writer->size;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
|
||||
Py_ssize_t length, Py_UCS4 maxchar)
|
||||
{
|
||||
Py_ssize_t newlen;
|
||||
PyObject *newbuffer;
|
||||
|
||||
assert(length >= 0);
|
||||
assert(maxchar <= _Py_MAX_UNICODE);
|
||||
|
||||
/* ensure that the _PyUnicodeWriter_Prepare macro was used */
|
||||
assert((maxchar > writer->maxchar && length >= 0)
|
||||
|| length > 0);
|
||||
|
||||
if (length > PY_SSIZE_T_MAX - writer->pos) {
|
||||
PyErr_NoMemory();
|
||||
return -1;
|
||||
}
|
||||
newlen = writer->pos + length;
|
||||
|
||||
maxchar = Py_MAX(maxchar, writer->min_char);
|
||||
|
||||
if (writer->buffer == NULL) {
|
||||
assert(!writer->readonly);
|
||||
if (writer->overallocate
|
||||
&& newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
|
||||
/* overallocate to limit the number of realloc() */
|
||||
newlen += newlen / OVERALLOCATE_FACTOR;
|
||||
}
|
||||
if (newlen < writer->min_length)
|
||||
newlen = writer->min_length;
|
||||
|
||||
writer->buffer = PyUnicode_New(newlen, maxchar);
|
||||
if (writer->buffer == NULL)
|
||||
return -1;
|
||||
}
|
||||
else if (newlen > writer->size) {
|
||||
if (writer->overallocate
|
||||
&& newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
|
||||
/* overallocate to limit the number of realloc() */
|
||||
newlen += newlen / OVERALLOCATE_FACTOR;
|
||||
}
|
||||
if (newlen < writer->min_length)
|
||||
newlen = writer->min_length;
|
||||
|
||||
if (maxchar > writer->maxchar || writer->readonly) {
|
||||
/* resize + widen */
|
||||
maxchar = Py_MAX(maxchar, writer->maxchar);
|
||||
newbuffer = PyUnicode_New(newlen, maxchar);
|
||||
if (newbuffer == NULL)
|
||||
return -1;
|
||||
_PyUnicode_FastCopyCharacters(newbuffer, 0,
|
||||
writer->buffer, 0, writer->pos);
|
||||
Py_DECREF(writer->buffer);
|
||||
writer->readonly = 0;
|
||||
}
|
||||
else {
|
||||
newbuffer = _PyUnicode_ResizeCompact(writer->buffer, newlen);
|
||||
if (newbuffer == NULL)
|
||||
return -1;
|
||||
}
|
||||
writer->buffer = newbuffer;
|
||||
}
|
||||
else if (maxchar > writer->maxchar) {
|
||||
assert(!writer->readonly);
|
||||
newbuffer = PyUnicode_New(writer->size, maxchar);
|
||||
if (newbuffer == NULL)
|
||||
return -1;
|
||||
_PyUnicode_FastCopyCharacters(newbuffer, 0,
|
||||
writer->buffer, 0, writer->pos);
|
||||
Py_SETREF(writer->buffer, newbuffer);
|
||||
}
|
||||
_PyUnicodeWriter_Update(writer);
|
||||
return 0;
|
||||
|
||||
#undef OVERALLOCATE_FACTOR
|
||||
}
|
||||
|
||||
int
|
||||
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
|
||||
int kind)
|
||||
{
|
||||
Py_UCS4 maxchar;
|
||||
|
||||
/* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
|
||||
assert(writer->kind < kind);
|
||||
|
||||
switch (kind)
|
||||
{
|
||||
case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
|
||||
case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
|
||||
case PyUnicode_4BYTE_KIND: maxchar = _Py_MAX_UNICODE; break;
|
||||
default:
|
||||
Py_UNREACHABLE();
|
||||
}
|
||||
|
||||
return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
|
||||
{
|
||||
return _PyUnicodeWriter_WriteCharInline(writer, ch);
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch)
|
||||
{
|
||||
if (ch > _Py_MAX_UNICODE) {
|
||||
PyErr_SetString(PyExc_ValueError,
|
||||
"character must be in range(0x110000)");
|
||||
return -1;
|
||||
}
|
||||
|
||||
return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch);
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
|
||||
{
|
||||
assert(PyUnicode_Check(str));
|
||||
|
||||
Py_UCS4 maxchar;
|
||||
Py_ssize_t len;
|
||||
|
||||
len = PyUnicode_GET_LENGTH(str);
|
||||
if (len == 0)
|
||||
return 0;
|
||||
maxchar = PyUnicode_MAX_CHAR_VALUE(str);
|
||||
if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
|
||||
if (writer->buffer == NULL && !writer->overallocate) {
|
||||
assert(_PyUnicode_CheckConsistency(str, 1));
|
||||
writer->readonly = 1;
|
||||
writer->buffer = Py_NewRef(str);
|
||||
_PyUnicodeWriter_Update(writer);
|
||||
writer->pos += len;
|
||||
return 0;
|
||||
}
|
||||
if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
|
||||
return -1;
|
||||
}
|
||||
_PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
|
||||
str, 0, len);
|
||||
writer->pos += len;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
|
||||
{
|
||||
PyTypeObject *type = Py_TYPE(obj);
|
||||
if (type == &PyUnicode_Type) {
|
||||
return _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, obj);
|
||||
}
|
||||
|
||||
if (type == &PyLong_Type) {
|
||||
return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
|
||||
}
|
||||
|
||||
PyObject *str = PyObject_Str(obj);
|
||||
if (str == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str);
|
||||
Py_DECREF(str);
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj)
|
||||
{
|
||||
if (Py_TYPE(obj) == &PyLong_Type) {
|
||||
return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
|
||||
}
|
||||
|
||||
PyObject *repr = PyObject_Repr(obj);
|
||||
if (repr == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, repr);
|
||||
Py_DECREF(repr);
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
|
||||
Py_ssize_t start, Py_ssize_t end)
|
||||
{
|
||||
assert(0 <= start);
|
||||
assert(end <= PyUnicode_GET_LENGTH(str));
|
||||
assert(start <= end);
|
||||
|
||||
if (start == 0 && end == PyUnicode_GET_LENGTH(str))
|
||||
return _PyUnicodeWriter_WriteStr(writer, str);
|
||||
|
||||
Py_ssize_t len = end - start;
|
||||
if (len == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
Py_UCS4 maxchar;
|
||||
if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) {
|
||||
maxchar = _PyUnicode_FindMaxChar(str, start, end);
|
||||
}
|
||||
else {
|
||||
maxchar = writer->maxchar;
|
||||
}
|
||||
if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
_PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
|
||||
str, start, len);
|
||||
writer->pos += len;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str,
|
||||
Py_ssize_t start, Py_ssize_t end)
|
||||
{
|
||||
if (!PyUnicode_Check(str)) {
|
||||
PyErr_Format(PyExc_TypeError, "expect str, not %T", str);
|
||||
return -1;
|
||||
}
|
||||
if (start < 0 || start > end) {
|
||||
PyErr_Format(PyExc_ValueError, "invalid start argument");
|
||||
return -1;
|
||||
}
|
||||
if (end > PyUnicode_GET_LENGTH(str)) {
|
||||
PyErr_Format(PyExc_ValueError, "invalid end argument");
|
||||
return -1;
|
||||
}
|
||||
|
||||
return _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter*)writer, str,
|
||||
start, end);
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
|
||||
const char *ascii, Py_ssize_t len)
|
||||
{
|
||||
if (len == -1)
|
||||
len = strlen(ascii);
|
||||
|
||||
assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
|
||||
|
||||
if (writer->buffer == NULL && !writer->overallocate) {
|
||||
PyObject *str;
|
||||
|
||||
str = _PyUnicode_FromASCII(ascii, len);
|
||||
if (str == NULL)
|
||||
return -1;
|
||||
|
||||
writer->readonly = 1;
|
||||
writer->buffer = str;
|
||||
_PyUnicodeWriter_Update(writer);
|
||||
writer->pos += len;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
|
||||
return -1;
|
||||
|
||||
switch (writer->kind)
|
||||
{
|
||||
case PyUnicode_1BYTE_KIND:
|
||||
{
|
||||
const Py_UCS1 *str = (const Py_UCS1 *)ascii;
|
||||
Py_UCS1 *data = writer->data;
|
||||
|
||||
memcpy(data + writer->pos, str, len);
|
||||
break;
|
||||
}
|
||||
case PyUnicode_2BYTE_KIND:
|
||||
{
|
||||
_PyUnicode_CONVERT_BYTES(
|
||||
Py_UCS1, Py_UCS2,
|
||||
ascii, ascii + len,
|
||||
(Py_UCS2 *)writer->data + writer->pos);
|
||||
break;
|
||||
}
|
||||
case PyUnicode_4BYTE_KIND:
|
||||
{
|
||||
_PyUnicode_CONVERT_BYTES(
|
||||
Py_UCS1, Py_UCS4,
|
||||
ascii, ascii + len,
|
||||
(Py_UCS4 *)writer->data + writer->pos);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
Py_UNREACHABLE();
|
||||
}
|
||||
|
||||
writer->pos += len;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer,
|
||||
const char *str,
|
||||
Py_ssize_t size)
|
||||
{
|
||||
assert(writer != NULL);
|
||||
_Py_AssertHoldsTstate();
|
||||
|
||||
_PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer;
|
||||
return _PyUnicodeWriter_WriteASCIIString(priv_writer, str, size);
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
|
||||
const char *str,
|
||||
Py_ssize_t size)
|
||||
{
|
||||
if (size < 0) {
|
||||
size = strlen(str);
|
||||
}
|
||||
|
||||
_PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
|
||||
Py_ssize_t old_pos = _writer->pos;
|
||||
int res = _PyUnicode_DecodeUTF8Writer(_writer, str, size,
|
||||
_Py_ERROR_STRICT, NULL, NULL);
|
||||
if (res < 0) {
|
||||
_writer->pos = old_pos;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
|
||||
const char *string,
|
||||
Py_ssize_t length,
|
||||
const char *errors,
|
||||
Py_ssize_t *consumed)
|
||||
{
|
||||
if (length < 0) {
|
||||
length = strlen(string);
|
||||
}
|
||||
|
||||
_PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
|
||||
Py_ssize_t old_pos = _writer->pos;
|
||||
int res = _PyUnicode_DecodeUTF8Writer(_writer, string, length,
|
||||
_Py_ERROR_UNKNOWN, errors,
|
||||
consumed);
|
||||
if (res < 0) {
|
||||
_writer->pos = old_pos;
|
||||
if (consumed) {
|
||||
*consumed = 0;
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
|
||||
const char *str, Py_ssize_t len)
|
||||
{
|
||||
Py_UCS4 maxchar;
|
||||
|
||||
maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
|
||||
if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
|
||||
return -1;
|
||||
unicode_write_cstr(writer->buffer, writer->pos, str, len);
|
||||
writer->pos += len;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
PyObject *
|
||||
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
|
||||
{
|
||||
PyObject *str;
|
||||
|
||||
if (writer->pos == 0) {
|
||||
Py_CLEAR(writer->buffer);
|
||||
return _PyUnicode_GetEmpty();
|
||||
}
|
||||
|
||||
str = writer->buffer;
|
||||
writer->buffer = NULL;
|
||||
|
||||
if (writer->readonly) {
|
||||
assert(PyUnicode_GET_LENGTH(str) == writer->pos);
|
||||
return str;
|
||||
}
|
||||
|
||||
if (PyUnicode_GET_LENGTH(str) != writer->pos) {
|
||||
PyObject *str2;
|
||||
str2 = _PyUnicode_ResizeCompact(str, writer->pos);
|
||||
if (str2 == NULL) {
|
||||
Py_DECREF(str);
|
||||
return NULL;
|
||||
}
|
||||
str = str2;
|
||||
}
|
||||
|
||||
assert(_PyUnicode_CheckConsistency(str, 1));
|
||||
return _PyUnicode_Result(str);
|
||||
}
|
||||
|
||||
|
||||
PyObject*
|
||||
PyUnicodeWriter_Finish(PyUnicodeWriter *writer)
|
||||
{
|
||||
PyObject *str = _PyUnicodeWriter_Finish((_PyUnicodeWriter*)writer);
|
||||
assert(((_PyUnicodeWriter*)writer)->buffer == NULL);
|
||||
_Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
|
||||
return str;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
|
||||
{
|
||||
Py_CLEAR(writer->buffer);
|
||||
}
|
||||
@@ -46,7 +46,6 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
#include "pycore_codecs.h" // _PyCodec_Lookup()
|
||||
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
|
||||
#include "pycore_format.h" // F_LJUST
|
||||
#include "pycore_freelist.h" // _Py_FREELIST_FREE(), _Py_FREELIST_POP()
|
||||
#include "pycore_initconfig.h" // _PyStatus_OK()
|
||||
#include "pycore_interp.h" // PyInterpreterState.fs_codec
|
||||
#include "pycore_long.h" // _PyLong_FormatWriter()
|
||||
@@ -184,45 +183,9 @@ static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
|
||||
}
|
||||
|
||||
|
||||
/* Generic helper macro to convert characters of different types.
|
||||
from_type and to_type have to be valid type names, begin and end
|
||||
are pointers to the source characters which should be of type
|
||||
"from_type *". to is a pointer of type "to_type *" and points to the
|
||||
buffer where the result characters are written to. */
|
||||
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
|
||||
do { \
|
||||
to_type *_to = (to_type *)(to); \
|
||||
const from_type *_iter = (const from_type *)(begin);\
|
||||
const from_type *_end = (const from_type *)(end);\
|
||||
Py_ssize_t n = (_end) - (_iter); \
|
||||
const from_type *_unrolled_end = \
|
||||
_iter + _Py_SIZE_ROUND_DOWN(n, 4); \
|
||||
while (_iter < (_unrolled_end)) { \
|
||||
_to[0] = (to_type) _iter[0]; \
|
||||
_to[1] = (to_type) _iter[1]; \
|
||||
_to[2] = (to_type) _iter[2]; \
|
||||
_to[3] = (to_type) _iter[3]; \
|
||||
_iter += 4; _to += 4; \
|
||||
} \
|
||||
while (_iter < (_end)) \
|
||||
*_to++ = (to_type) *_iter++; \
|
||||
} while (0)
|
||||
|
||||
#define LATIN1 _Py_LATIN1_CHR
|
||||
|
||||
#ifdef MS_WINDOWS
|
||||
/* On Windows, overallocate by 50% is the best factor */
|
||||
# define OVERALLOCATE_FACTOR 2
|
||||
#else
|
||||
/* On Linux, overallocate by 25% is the best factor */
|
||||
# define OVERALLOCATE_FACTOR 4
|
||||
#endif
|
||||
|
||||
/* Forward declaration */
|
||||
static inline int
|
||||
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
|
||||
static inline void
|
||||
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
|
||||
static PyObject *
|
||||
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
|
||||
const char *errors);
|
||||
@@ -230,11 +193,6 @@ static PyObject *
|
||||
unicode_decode_utf8(const char *s, Py_ssize_t size,
|
||||
_Py_error_handler error_handler, const char *errors,
|
||||
Py_ssize_t *consumed);
|
||||
static int
|
||||
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
|
||||
const char *s, Py_ssize_t size,
|
||||
_Py_error_handler error_handler, const char *errors,
|
||||
Py_ssize_t *consumed);
|
||||
#ifdef Py_DEBUG
|
||||
static inline int unicode_is_finalizing(void);
|
||||
static int unicode_is_singleton(PyObject *unicode);
|
||||
@@ -242,7 +200,8 @@ static int unicode_is_singleton(PyObject *unicode);
|
||||
|
||||
|
||||
// Return a reference to the immortal empty string singleton.
|
||||
static inline PyObject* unicode_get_empty(void)
|
||||
PyObject*
|
||||
_PyUnicode_GetEmpty(void)
|
||||
{
|
||||
_Py_DECLARE_STR(empty, "");
|
||||
return &_Py_STR(empty);
|
||||
@@ -416,7 +375,7 @@ static void clear_global_interned_strings(void)
|
||||
|
||||
#define _Py_RETURN_UNICODE_EMPTY() \
|
||||
do { \
|
||||
return unicode_get_empty(); \
|
||||
return _PyUnicode_GetEmpty();\
|
||||
} while (0)
|
||||
|
||||
|
||||
@@ -748,14 +707,14 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
|
||||
#undef CHECK
|
||||
}
|
||||
|
||||
static PyObject*
|
||||
unicode_result(PyObject *unicode)
|
||||
PyObject*
|
||||
_PyUnicode_Result(PyObject *unicode)
|
||||
{
|
||||
assert(_PyUnicode_CHECK(unicode));
|
||||
|
||||
Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
|
||||
if (length == 0) {
|
||||
PyObject *empty = unicode_get_empty();
|
||||
PyObject *empty = _PyUnicode_GetEmpty();
|
||||
if (unicode != empty) {
|
||||
Py_DECREF(unicode);
|
||||
}
|
||||
@@ -778,6 +737,7 @@ unicode_result(PyObject *unicode)
|
||||
assert(_PyUnicode_CheckConsistency(unicode, 1));
|
||||
return unicode;
|
||||
}
|
||||
#define unicode_result _PyUnicode_Result
|
||||
|
||||
static PyObject*
|
||||
unicode_result_unchanged(PyObject *unicode)
|
||||
@@ -985,7 +945,7 @@ make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
|
||||
|
||||
/* Compilation of templated routines */
|
||||
|
||||
#define STRINGLIB_GET_EMPTY() unicode_get_empty()
|
||||
#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()
|
||||
|
||||
#include "stringlib/asciilib.h"
|
||||
#include "stringlib/fastsearch.h"
|
||||
@@ -1097,8 +1057,8 @@ resize_copy(PyObject *unicode, Py_ssize_t length)
|
||||
return copy;
|
||||
}
|
||||
|
||||
static PyObject*
|
||||
resize_compact(PyObject *unicode, Py_ssize_t length)
|
||||
PyObject*
|
||||
_PyUnicode_ResizeCompact(PyObject *unicode, Py_ssize_t length)
|
||||
{
|
||||
Py_ssize_t char_size;
|
||||
Py_ssize_t struct_size;
|
||||
@@ -1306,7 +1266,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
|
||||
{
|
||||
/* Optimization for empty strings */
|
||||
if (size == 0) {
|
||||
return unicode_get_empty();
|
||||
return _PyUnicode_GetEmpty();
|
||||
}
|
||||
|
||||
PyObject *obj;
|
||||
@@ -1799,7 +1759,7 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length)
|
||||
return 0;
|
||||
|
||||
if (length == 0) {
|
||||
PyObject *empty = unicode_get_empty();
|
||||
PyObject *empty = _PyUnicode_GetEmpty();
|
||||
Py_SETREF(*p_unicode, empty);
|
||||
return 0;
|
||||
}
|
||||
@@ -1813,7 +1773,7 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length)
|
||||
}
|
||||
|
||||
if (PyUnicode_IS_COMPACT(unicode)) {
|
||||
PyObject *new_unicode = resize_compact(unicode, length);
|
||||
PyObject *new_unicode = _PyUnicode_ResizeCompact(unicode, length);
|
||||
if (new_unicode == NULL)
|
||||
return -1;
|
||||
*p_unicode = new_unicode;
|
||||
@@ -1839,58 +1799,6 @@ PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
|
||||
return unicode_resize(p_unicode, length);
|
||||
}
|
||||
|
||||
/* Copy an ASCII or latin1 char* string into a Python Unicode string.
|
||||
|
||||
WARNING: The function doesn't copy the terminating null character and
|
||||
doesn't check the maximum character (may write a latin1 character in an
|
||||
ASCII string). */
|
||||
static void
|
||||
unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
|
||||
const char *str, Py_ssize_t len)
|
||||
{
|
||||
int kind = PyUnicode_KIND(unicode);
|
||||
const void *data = PyUnicode_DATA(unicode);
|
||||
const char *end = str + len;
|
||||
|
||||
assert(index + len <= PyUnicode_GET_LENGTH(unicode));
|
||||
switch (kind) {
|
||||
case PyUnicode_1BYTE_KIND: {
|
||||
#ifdef Py_DEBUG
|
||||
if (PyUnicode_IS_ASCII(unicode)) {
|
||||
Py_UCS4 maxchar = ucs1lib_find_max_char(
|
||||
(const Py_UCS1*)str,
|
||||
(const Py_UCS1*)str + len);
|
||||
assert(maxchar < 128);
|
||||
}
|
||||
#endif
|
||||
memcpy((char *) data + index, str, len);
|
||||
break;
|
||||
}
|
||||
case PyUnicode_2BYTE_KIND: {
|
||||
Py_UCS2 *start = (Py_UCS2 *)data + index;
|
||||
Py_UCS2 *ucs2 = start;
|
||||
|
||||
for (; str < end; ++ucs2, ++str)
|
||||
*ucs2 = (Py_UCS2)*str;
|
||||
|
||||
assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
|
||||
break;
|
||||
}
|
||||
case PyUnicode_4BYTE_KIND: {
|
||||
Py_UCS4 *start = (Py_UCS4 *)data + index;
|
||||
Py_UCS4 *ucs4 = start;
|
||||
|
||||
for (; str < end; ++ucs4, ++str)
|
||||
*ucs4 = (Py_UCS4)*str;
|
||||
|
||||
assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
|
||||
break;
|
||||
}
|
||||
default:
|
||||
Py_UNREACHABLE();
|
||||
}
|
||||
}
|
||||
|
||||
static PyObject*
|
||||
get_latin1_char(Py_UCS1 ch)
|
||||
{
|
||||
@@ -2105,7 +2013,7 @@ PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
|
||||
"NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
|
||||
return NULL;
|
||||
}
|
||||
return unicode_get_empty();
|
||||
return _PyUnicode_GetEmpty();
|
||||
}
|
||||
|
||||
PyObject *
|
||||
@@ -2672,8 +2580,8 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
|
||||
}
|
||||
|
||||
if (width < 0) {
|
||||
return unicode_decode_utf8_writer(writer, str, length,
|
||||
_Py_ERROR_REPLACE, "replace", pconsumed);
|
||||
return _PyUnicode_DecodeUTF8Writer(writer, str, length,
|
||||
_Py_ERROR_REPLACE, "replace", pconsumed);
|
||||
}
|
||||
|
||||
PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
|
||||
@@ -5424,11 +5332,11 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
|
||||
|
||||
|
||||
// Used by PyUnicodeWriter_WriteUTF8() implementation
|
||||
static int
|
||||
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
|
||||
const char *s, Py_ssize_t size,
|
||||
_Py_error_handler error_handler, const char *errors,
|
||||
Py_ssize_t *consumed)
|
||||
int
|
||||
_PyUnicode_DecodeUTF8Writer(_PyUnicodeWriter *writer,
|
||||
const char *s, Py_ssize_t size,
|
||||
_Py_error_handler error_handler, const char *errors,
|
||||
Py_ssize_t *consumed)
|
||||
{
|
||||
if (size == 0) {
|
||||
if (consumed) {
|
||||
@@ -10766,7 +10674,7 @@ replace(PyObject *self, PyObject *str1,
|
||||
}
|
||||
new_size = slen + n * (len2 - len1);
|
||||
if (new_size == 0) {
|
||||
u = unicode_get_empty();
|
||||
u = _PyUnicode_GetEmpty();
|
||||
goto done;
|
||||
}
|
||||
if (new_size > (PY_SSIZE_T_MAX / rkind)) {
|
||||
@@ -11439,7 +11347,7 @@ PyUnicode_Concat(PyObject *left, PyObject *right)
|
||||
}
|
||||
|
||||
/* Shortcuts */
|
||||
PyObject *empty = unicode_get_empty(); // Borrowed reference
|
||||
PyObject *empty = _PyUnicode_GetEmpty(); // Borrowed reference
|
||||
if (left == empty) {
|
||||
return PyUnicode_FromObject(right);
|
||||
}
|
||||
@@ -11491,7 +11399,7 @@ PyUnicode_Append(PyObject **p_left, PyObject *right)
|
||||
}
|
||||
|
||||
/* Shortcuts */
|
||||
PyObject *empty = unicode_get_empty(); // Borrowed reference
|
||||
PyObject *empty = _PyUnicode_GetEmpty(); // Borrowed reference
|
||||
if (left == empty) {
|
||||
Py_DECREF(left);
|
||||
*p_left = Py_NewRef(right);
|
||||
@@ -12987,7 +12895,7 @@ PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
|
||||
len1 = PyUnicode_GET_LENGTH(str_obj);
|
||||
len2 = PyUnicode_GET_LENGTH(sep_obj);
|
||||
if (kind1 < kind2 || len1 < len2) {
|
||||
PyObject *empty = unicode_get_empty(); // Borrowed reference
|
||||
PyObject *empty = _PyUnicode_GetEmpty(); // Borrowed reference
|
||||
return PyTuple_Pack(3, str_obj, empty, empty);
|
||||
}
|
||||
buf1 = PyUnicode_DATA(str_obj);
|
||||
@@ -13039,7 +12947,7 @@ PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
|
||||
len1 = PyUnicode_GET_LENGTH(str_obj);
|
||||
len2 = PyUnicode_GET_LENGTH(sep_obj);
|
||||
if (kind1 < kind2 || len1 < len2) {
|
||||
PyObject *empty = unicode_get_empty(); // Borrowed reference
|
||||
PyObject *empty = _PyUnicode_GetEmpty(); // Borrowed reference
|
||||
return PyTuple_Pack(3, empty, empty, str_obj);
|
||||
}
|
||||
buf1 = PyUnicode_DATA(str_obj);
|
||||
@@ -13518,523 +13426,6 @@ unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
|
||||
}
|
||||
|
||||
|
||||
static inline void
|
||||
_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
|
||||
{
|
||||
writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
|
||||
writer->data = PyUnicode_DATA(writer->buffer);
|
||||
|
||||
if (!writer->readonly) {
|
||||
writer->kind = PyUnicode_KIND(writer->buffer);
|
||||
writer->size = PyUnicode_GET_LENGTH(writer->buffer);
|
||||
}
|
||||
else {
|
||||
/* use a value smaller than PyUnicode_1BYTE_KIND() so
|
||||
_PyUnicodeWriter_PrepareKind() will copy the buffer. */
|
||||
writer->kind = 0;
|
||||
assert(writer->kind <= PyUnicode_1BYTE_KIND);
|
||||
|
||||
/* Copy-on-write mode: set buffer size to 0 so
|
||||
* _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
|
||||
* next write. */
|
||||
writer->size = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
|
||||
{
|
||||
memset(writer, 0, sizeof(*writer));
|
||||
|
||||
/* ASCII is the bare minimum */
|
||||
writer->min_char = 127;
|
||||
|
||||
/* use a kind value smaller than PyUnicode_1BYTE_KIND so
|
||||
_PyUnicodeWriter_PrepareKind() will copy the buffer. */
|
||||
assert(writer->kind == 0);
|
||||
assert(writer->kind < PyUnicode_1BYTE_KIND);
|
||||
}
|
||||
|
||||
|
||||
PyUnicodeWriter*
|
||||
PyUnicodeWriter_Create(Py_ssize_t length)
|
||||
{
|
||||
if (length < 0) {
|
||||
PyErr_SetString(PyExc_ValueError,
|
||||
"length must be positive");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const size_t size = sizeof(_PyUnicodeWriter);
|
||||
PyUnicodeWriter *pub_writer;
|
||||
pub_writer = _Py_FREELIST_POP_MEM(unicode_writers);
|
||||
if (pub_writer == NULL) {
|
||||
pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size);
|
||||
if (pub_writer == NULL) {
|
||||
return (PyUnicodeWriter *)PyErr_NoMemory();
|
||||
}
|
||||
}
|
||||
_PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
|
||||
|
||||
_PyUnicodeWriter_Init(writer);
|
||||
if (_PyUnicodeWriter_Prepare(writer, length, 127) < 0) {
|
||||
PyUnicodeWriter_Discard(pub_writer);
|
||||
return NULL;
|
||||
}
|
||||
writer->overallocate = 1;
|
||||
|
||||
return pub_writer;
|
||||
}
|
||||
|
||||
|
||||
/* Release a writer without producing a string.  A NULL writer is
   accepted and ignored. */
void PyUnicodeWriter_Discard(PyUnicodeWriter *writer)
{
    if (writer != NULL) {
        /* Drop the buffer first, then hand the structure back to the
           freelist (or to PyMem_Free). */
        _PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer);
        _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
    }
}
|
||||
|
||||
|
||||
// Initialize _PyUnicodeWriter with initial buffer
|
||||
static inline void
|
||||
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
|
||||
{
|
||||
memset(writer, 0, sizeof(*writer));
|
||||
writer->buffer = buffer;
|
||||
_PyUnicodeWriter_Update(writer);
|
||||
writer->min_length = writer->size;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
|
||||
Py_ssize_t length, Py_UCS4 maxchar)
|
||||
{
|
||||
Py_ssize_t newlen;
|
||||
PyObject *newbuffer;
|
||||
|
||||
assert(length >= 0);
|
||||
assert(maxchar <= MAX_UNICODE);
|
||||
|
||||
/* ensure that the _PyUnicodeWriter_Prepare macro was used */
|
||||
assert((maxchar > writer->maxchar && length >= 0)
|
||||
|| length > 0);
|
||||
|
||||
if (length > PY_SSIZE_T_MAX - writer->pos) {
|
||||
PyErr_NoMemory();
|
||||
return -1;
|
||||
}
|
||||
newlen = writer->pos + length;
|
||||
|
||||
maxchar = Py_MAX(maxchar, writer->min_char);
|
||||
|
||||
if (writer->buffer == NULL) {
|
||||
assert(!writer->readonly);
|
||||
if (writer->overallocate
|
||||
&& newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
|
||||
/* overallocate to limit the number of realloc() */
|
||||
newlen += newlen / OVERALLOCATE_FACTOR;
|
||||
}
|
||||
if (newlen < writer->min_length)
|
||||
newlen = writer->min_length;
|
||||
|
||||
writer->buffer = PyUnicode_New(newlen, maxchar);
|
||||
if (writer->buffer == NULL)
|
||||
return -1;
|
||||
}
|
||||
else if (newlen > writer->size) {
|
||||
if (writer->overallocate
|
||||
&& newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
|
||||
/* overallocate to limit the number of realloc() */
|
||||
newlen += newlen / OVERALLOCATE_FACTOR;
|
||||
}
|
||||
if (newlen < writer->min_length)
|
||||
newlen = writer->min_length;
|
||||
|
||||
if (maxchar > writer->maxchar || writer->readonly) {
|
||||
/* resize + widen */
|
||||
maxchar = Py_MAX(maxchar, writer->maxchar);
|
||||
newbuffer = PyUnicode_New(newlen, maxchar);
|
||||
if (newbuffer == NULL)
|
||||
return -1;
|
||||
_PyUnicode_FastCopyCharacters(newbuffer, 0,
|
||||
writer->buffer, 0, writer->pos);
|
||||
Py_DECREF(writer->buffer);
|
||||
writer->readonly = 0;
|
||||
}
|
||||
else {
|
||||
newbuffer = resize_compact(writer->buffer, newlen);
|
||||
if (newbuffer == NULL)
|
||||
return -1;
|
||||
}
|
||||
writer->buffer = newbuffer;
|
||||
}
|
||||
else if (maxchar > writer->maxchar) {
|
||||
assert(!writer->readonly);
|
||||
newbuffer = PyUnicode_New(writer->size, maxchar);
|
||||
if (newbuffer == NULL)
|
||||
return -1;
|
||||
_PyUnicode_FastCopyCharacters(newbuffer, 0,
|
||||
writer->buffer, 0, writer->pos);
|
||||
Py_SETREF(writer->buffer, newbuffer);
|
||||
}
|
||||
_PyUnicodeWriter_Update(writer);
|
||||
return 0;
|
||||
|
||||
#undef OVERALLOCATE_FACTOR
|
||||
}
|
||||
|
||||
int
|
||||
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
|
||||
int kind)
|
||||
{
|
||||
Py_UCS4 maxchar;
|
||||
|
||||
/* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
|
||||
assert(writer->kind < kind);
|
||||
|
||||
switch (kind)
|
||||
{
|
||||
case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
|
||||
case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
|
||||
case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
|
||||
default:
|
||||
Py_UNREACHABLE();
|
||||
}
|
||||
|
||||
return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
|
||||
}
|
||||
|
||||
int
|
||||
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
|
||||
{
|
||||
return _PyUnicodeWriter_WriteCharInline(writer, ch);
|
||||
}
|
||||
|
||||
int
|
||||
PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch)
|
||||
{
|
||||
if (ch > MAX_UNICODE) {
|
||||
PyErr_SetString(PyExc_ValueError,
|
||||
"character must be in range(0x110000)");
|
||||
return -1;
|
||||
}
|
||||
|
||||
return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch);
|
||||
}
|
||||
|
||||
int
|
||||
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
|
||||
{
|
||||
assert(PyUnicode_Check(str));
|
||||
|
||||
Py_UCS4 maxchar;
|
||||
Py_ssize_t len;
|
||||
|
||||
len = PyUnicode_GET_LENGTH(str);
|
||||
if (len == 0)
|
||||
return 0;
|
||||
maxchar = PyUnicode_MAX_CHAR_VALUE(str);
|
||||
if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
|
||||
if (writer->buffer == NULL && !writer->overallocate) {
|
||||
assert(_PyUnicode_CheckConsistency(str, 1));
|
||||
writer->readonly = 1;
|
||||
writer->buffer = Py_NewRef(str);
|
||||
_PyUnicodeWriter_Update(writer);
|
||||
writer->pos += len;
|
||||
return 0;
|
||||
}
|
||||
if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
|
||||
return -1;
|
||||
}
|
||||
_PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
|
||||
str, 0, len);
|
||||
writer->pos += len;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
|
||||
{
|
||||
PyTypeObject *type = Py_TYPE(obj);
|
||||
if (type == &PyUnicode_Type) {
|
||||
return _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, obj);
|
||||
}
|
||||
|
||||
if (type == &PyLong_Type) {
|
||||
return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
|
||||
}
|
||||
|
||||
PyObject *str = PyObject_Str(obj);
|
||||
if (str == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str);
|
||||
Py_DECREF(str);
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj)
|
||||
{
|
||||
if (Py_TYPE(obj) == &PyLong_Type) {
|
||||
return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
|
||||
}
|
||||
|
||||
PyObject *repr = PyObject_Repr(obj);
|
||||
if (repr == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, repr);
|
||||
Py_DECREF(repr);
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
|
||||
Py_ssize_t start, Py_ssize_t end)
|
||||
{
|
||||
assert(0 <= start);
|
||||
assert(end <= PyUnicode_GET_LENGTH(str));
|
||||
assert(start <= end);
|
||||
|
||||
if (start == 0 && end == PyUnicode_GET_LENGTH(str))
|
||||
return _PyUnicodeWriter_WriteStr(writer, str);
|
||||
|
||||
Py_ssize_t len = end - start;
|
||||
if (len == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
Py_UCS4 maxchar;
|
||||
if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) {
|
||||
maxchar = _PyUnicode_FindMaxChar(str, start, end);
|
||||
}
|
||||
else {
|
||||
maxchar = writer->maxchar;
|
||||
}
|
||||
if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
_PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
|
||||
str, start, len);
|
||||
writer->pos += len;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str,
|
||||
Py_ssize_t start, Py_ssize_t end)
|
||||
{
|
||||
if (!PyUnicode_Check(str)) {
|
||||
PyErr_Format(PyExc_TypeError, "expect str, not %T", str);
|
||||
return -1;
|
||||
}
|
||||
if (start < 0 || start > end) {
|
||||
PyErr_Format(PyExc_ValueError, "invalid start argument");
|
||||
return -1;
|
||||
}
|
||||
if (end > PyUnicode_GET_LENGTH(str)) {
|
||||
PyErr_Format(PyExc_ValueError, "invalid end argument");
|
||||
return -1;
|
||||
}
|
||||
|
||||
return _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter*)writer, str,
|
||||
start, end);
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
|
||||
const char *ascii, Py_ssize_t len)
|
||||
{
|
||||
if (len == -1)
|
||||
len = strlen(ascii);
|
||||
|
||||
assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
|
||||
|
||||
if (writer->buffer == NULL && !writer->overallocate) {
|
||||
PyObject *str;
|
||||
|
||||
str = _PyUnicode_FromASCII(ascii, len);
|
||||
if (str == NULL)
|
||||
return -1;
|
||||
|
||||
writer->readonly = 1;
|
||||
writer->buffer = str;
|
||||
_PyUnicodeWriter_Update(writer);
|
||||
writer->pos += len;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
|
||||
return -1;
|
||||
|
||||
switch (writer->kind)
|
||||
{
|
||||
case PyUnicode_1BYTE_KIND:
|
||||
{
|
||||
const Py_UCS1 *str = (const Py_UCS1 *)ascii;
|
||||
Py_UCS1 *data = writer->data;
|
||||
|
||||
memcpy(data + writer->pos, str, len);
|
||||
break;
|
||||
}
|
||||
case PyUnicode_2BYTE_KIND:
|
||||
{
|
||||
_PyUnicode_CONVERT_BYTES(
|
||||
Py_UCS1, Py_UCS2,
|
||||
ascii, ascii + len,
|
||||
(Py_UCS2 *)writer->data + writer->pos);
|
||||
break;
|
||||
}
|
||||
case PyUnicode_4BYTE_KIND:
|
||||
{
|
||||
_PyUnicode_CONVERT_BYTES(
|
||||
Py_UCS1, Py_UCS4,
|
||||
ascii, ascii + len,
|
||||
(Py_UCS4 *)writer->data + writer->pos);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
Py_UNREACHABLE();
|
||||
}
|
||||
|
||||
writer->pos += len;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer,
|
||||
const char *str,
|
||||
Py_ssize_t size)
|
||||
{
|
||||
assert(writer != NULL);
|
||||
_Py_AssertHoldsTstate();
|
||||
|
||||
_PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer;
|
||||
return _PyUnicodeWriter_WriteASCIIString(priv_writer, str, size);
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
|
||||
const char *str,
|
||||
Py_ssize_t size)
|
||||
{
|
||||
if (size < 0) {
|
||||
size = strlen(str);
|
||||
}
|
||||
|
||||
_PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
|
||||
Py_ssize_t old_pos = _writer->pos;
|
||||
int res = unicode_decode_utf8_writer(_writer, str, size,
|
||||
_Py_ERROR_STRICT, NULL, NULL);
|
||||
if (res < 0) {
|
||||
_writer->pos = old_pos;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
|
||||
const char *string,
|
||||
Py_ssize_t length,
|
||||
const char *errors,
|
||||
Py_ssize_t *consumed)
|
||||
{
|
||||
if (length < 0) {
|
||||
length = strlen(string);
|
||||
}
|
||||
|
||||
_PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
|
||||
Py_ssize_t old_pos = _writer->pos;
|
||||
int res = unicode_decode_utf8_writer(_writer, string, length,
|
||||
_Py_ERROR_UNKNOWN, errors, consumed);
|
||||
if (res < 0) {
|
||||
_writer->pos = old_pos;
|
||||
if (consumed) {
|
||||
*consumed = 0;
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
|
||||
const char *str, Py_ssize_t len)
|
||||
{
|
||||
Py_UCS4 maxchar;
|
||||
|
||||
maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
|
||||
if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
|
||||
return -1;
|
||||
unicode_write_cstr(writer->buffer, writer->pos, str, len);
|
||||
writer->pos += len;
|
||||
return 0;
|
||||
}
|
||||
|
||||
PyObject *
|
||||
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
|
||||
{
|
||||
PyObject *str;
|
||||
|
||||
if (writer->pos == 0) {
|
||||
Py_CLEAR(writer->buffer);
|
||||
_Py_RETURN_UNICODE_EMPTY();
|
||||
}
|
||||
|
||||
str = writer->buffer;
|
||||
writer->buffer = NULL;
|
||||
|
||||
if (writer->readonly) {
|
||||
assert(PyUnicode_GET_LENGTH(str) == writer->pos);
|
||||
return str;
|
||||
}
|
||||
|
||||
if (PyUnicode_GET_LENGTH(str) != writer->pos) {
|
||||
PyObject *str2;
|
||||
str2 = resize_compact(str, writer->pos);
|
||||
if (str2 == NULL) {
|
||||
Py_DECREF(str);
|
||||
return NULL;
|
||||
}
|
||||
str = str2;
|
||||
}
|
||||
|
||||
assert(_PyUnicode_CheckConsistency(str, 1));
|
||||
return unicode_result(str);
|
||||
}
|
||||
|
||||
|
||||
PyObject*
|
||||
PyUnicodeWriter_Finish(PyUnicodeWriter *writer)
|
||||
{
|
||||
PyObject *str = _PyUnicodeWriter_Finish((_PyUnicodeWriter*)writer);
|
||||
assert(((_PyUnicodeWriter*)writer)->buffer == NULL);
|
||||
_Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
|
||||
return str;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
|
||||
{
|
||||
Py_CLEAR(writer->buffer);
|
||||
}
|
||||
|
||||
#include "stringlib/unicode_format.h"
|
||||
|
||||
PyDoc_STRVAR(format__doc__,
|
||||
@@ -14456,7 +13847,7 @@ unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
|
||||
{
|
||||
PyObject *unicode;
|
||||
if (x == NULL) {
|
||||
unicode = unicode_get_empty();
|
||||
unicode = _PyUnicode_GetEmpty();
|
||||
}
|
||||
else if (encoding == NULL && errors == NULL) {
|
||||
unicode = PyObject_Str(x);
|
||||
@@ -14510,7 +13901,7 @@ unicode_vectorcall(PyObject *type, PyObject *const *args,
|
||||
return NULL;
|
||||
}
|
||||
if (nargs == 0) {
|
||||
return unicode_get_empty();
|
||||
return _PyUnicode_GetEmpty();
|
||||
}
|
||||
PyObject *object = args[0];
|
||||
if (nargs == 1) {
|
||||
@@ -15186,7 +14577,7 @@ unicodeiter_reduce(PyObject *op, PyObject *Py_UNUSED(ignored))
|
||||
if (it->it_seq != NULL) {
|
||||
return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
|
||||
} else {
|
||||
PyObject *u = unicode_get_empty();
|
||||
PyObject *u = _PyUnicode_GetEmpty();
|
||||
if (u == NULL) {
|
||||
Py_XDECREF(iter);
|
||||
return NULL;
|
||||
|
||||
@@ -167,6 +167,7 @@
|
||||
<ClCompile Include="..\Objects\unicode_format.c" />
|
||||
<ClCompile Include="..\Objects\unicodectype.c" />
|
||||
<ClCompile Include="..\Objects\unicode_formatter.c" />
|
||||
<ClCompile Include="..\Objects\unicode_writer.c" />
|
||||
<ClCompile Include="..\Objects\unicodeobject.c" />
|
||||
<ClCompile Include="..\Objects\unionobject.c" />
|
||||
<ClCompile Include="..\Objects\weakrefobject.c" />
|
||||
|
||||
@@ -490,6 +490,9 @@
|
||||
<ClCompile Include="..\Objects\unicode_formatter.c">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\Objects\unicode_writer.c">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\Objects\unicodeobject.c">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
|
||||
@@ -562,6 +562,7 @@
|
||||
<ClCompile Include="..\Objects\unicode_format.c" />
|
||||
<ClCompile Include="..\Objects\unicodectype.c" />
|
||||
<ClCompile Include="..\Objects\unicode_formatter.c" />
|
||||
<ClCompile Include="..\Objects\unicode_writer.c" />
|
||||
<ClCompile Include="..\Objects\unicodeobject.c" />
|
||||
<ClCompile Include="..\Objects\unionobject.c" />
|
||||
<ClCompile Include="..\Objects\weakrefobject.c" />
|
||||
|
||||
@@ -1283,6 +1283,9 @@
|
||||
<ClCompile Include="..\Objects\unicode_formatter.c">
|
||||
<Filter>Objects</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\Objects\unicode_writer.c">
|
||||
<Filter>Objects</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\Objects\unicodeobject.c">
|
||||
<Filter>Objects</Filter>
|
||||
</ClCompile>
|
||||
|
||||
Reference in New Issue
Block a user