gh-120754: Refactor I/O modules to stash whole stat result rather than individual members #123412
Changes from all commits
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -242,14 +242,7 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None, | ||
| buffering = -1 | ||
| line_buffering = True | ||
| if buffering < 0: | ||
| buffering = DEFAULT_BUFFER_SIZE | ||
| try: | ||
| bs = os.fstat(raw.fileno()).st_blksize | ||
| except (OSError, AttributeError): | ||
| pass | ||
| else: | ||
| if bs > 1: | ||
| buffering = bs | ||
| buffering = raw._blksize | ||
| if buffering < 0: | ||
| raise ValueError("invalid buffering size") | ||
| if buffering == 0: | ||
| @@ -1565,19 +1558,15 @@ def __init__(self, file, mode='r', closefd=True, opener=None): | ||
| os.set_inheritable(fd, False) | ||
| self._closefd = closefd | ||
| fdfstat = os.fstat(fd) | ||
| self._stat_atopen = os.fstat(fd) | ||
| try: | ||
| if stat.S_ISDIR(fdfstat.st_mode): | ||
| if stat.S_ISDIR(self._stat_atopen.st_mode): | ||
| raise IsADirectoryError(errno.EISDIR, | ||
| os.strerror(errno.EISDIR), file) | ||
| except AttributeError: | ||
| # Ignore the AttributeError if stat.S_ISDIR or errno.EISDIR | ||
| # don't exist. | ||
| pass | ||
| self._blksize = getattr(fdfstat, 'st_blksize', 0) | ||
| if self._blksize <= 1: | ||
| self._blksize = DEFAULT_BUFFER_SIZE | ||
| self._estimated_size = fdfstat.st_size | ||
| if _setmode: | ||
| # don't translate newlines (\r\n <=> \n) | ||
| @@ -1623,6 +1612,17 @@ def __repr__(self): | ||
| return ('<%s name=%r mode=%r closefd=%r>' % | ||
| (class_name, name, self.mode, self._closefd)) | ||
| @property | ||
| def _blksize(self): | ||
| if self._stat_atopen is None: | ||
| return DEFAULT_BUFFER_SIZE | ||
| blksize = getattr(self._stat_atopen, "st_blksize", 0) | ||
| # WASI sets blksize to 0 | ||
| if not blksize: | ||
| return DEFAULT_BUFFER_SIZE | ||
| return blksize | ||
| def _checkReadable(self): | ||
| if not self._readable: | ||
| raise UnsupportedOperation('File not open for reading') | ||
| @@ -1655,16 +1655,20 @@ def readall(self): | ||
| """ | ||
| self._checkClosed() | ||
| self._checkReadable() | ||
| if self._estimated_size <= 0: | ||
| if self._stat_atopen is None or self._stat_atopen.st_size <= 0: | ||
| bufsize = DEFAULT_BUFFER_SIZE | ||
| else: | ||
| bufsize = self._estimated_size + 1 | ||
| # In order to detect end of file, need a read() of at least 1 | ||
| # byte which returns size 0. Oversize the buffer by 1 byte so the | ||
| # I/O can be completed with two read() calls (one for all data, one | ||
| # for EOF) without needing to resize the buffer. | ||
| bufsize = self._stat_atopen.st_size + 1 | ||
| if self._estimated_size > 65536: | ||
| if self._stat_atopen.st_size > 65536: | ||
| try: | ||
| pos = os.lseek(self._fd, 0, SEEK_CUR) | ||
| if self._estimated_size >= pos: | ||
| bufsize = self._estimated_size - pos + 1 | ||
| if self._stat_atopen.st_size >= pos: | ||
| bufsize = self._stat_atopen.st_size - pos + 1 | ||
| except OSError: | ||
| pass | ||
| @@ -1742,7 +1746,7 @@ def truncate(self, size=None): | ||
| if size is None: | ||
| size = self.tell() | ||
| os.ftruncate(self._fd, size) | ||
| self._estimated_size = size | ||
| self._stat_atopen = None | ||
| return size | ||
| def close(self): | ||
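On the Python side (Lib/_pyio.py), the cached `_blksize` and `_estimated_size` attributes are replaced by a single `_stat_atopen` holding the whole `os.fstat()` result captured when the file is opened: `_blksize` becomes a property derived from it, `readall()` sizes its buffer from `st_size`, and `truncate()` simply drops the stashed result since the size it recorded is no longer accurate. The sketch below is illustrative only (the class name `_StatStashingFile` and the helper `_readall_bufsize` are invented for this example), not the PR's actual code:

```python
import os

DEFAULT_BUFFER_SIZE = 8 * 1024  # mirrors io.DEFAULT_BUFFER_SIZE


class _StatStashingFile:
    """Illustrative sketch of the stat-stashing pattern adopted by FileIO."""

    def __init__(self, fd):
        self._fd = fd
        try:
            # One fstat() at open time; everything below derives from it.
            self._stat_atopen = os.fstat(fd)
        except OSError:
            self._stat_atopen = None

    @property
    def _blksize(self):
        # Preferred I/O block size, computed lazily from the stashed stat.
        if self._stat_atopen is None:
            return DEFAULT_BUFFER_SIZE
        blksize = getattr(self._stat_atopen, "st_blksize", 0)
        # Some platforms (e.g. WASI) report 0; fall back to the default.
        return blksize if blksize > 1 else DEFAULT_BUFFER_SIZE

    def _readall_bufsize(self):
        # Oversize by one byte so readall() can usually finish in two
        # read() calls: one returning all the data, one returning b"" (EOF).
        if self._stat_atopen is None or self._stat_atopen.st_size <= 0:
            return DEFAULT_BUFFER_SIZE
        return self._stat_atopen.st_size + 1

    def truncate(self, size):
        os.ftruncate(self._fd, size)
        # The size seen at open is now stale; drop the whole stat result and
        # let readall() fall back to dynamic buffer resizing.
        self._stat_atopen = None
        return size
```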
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -74,8 +74,13 @@ typedef struct{ | ||
| signed int seekable : 2; /* -1 means unknown */ | ||
| unsigned int closefd : 1; | ||
| char finalizing; | ||
| unsigned int blksize; | ||
| Py_off_t estimated_size; | ||
| /* Stat result which was grabbed at file open, useful for optimizing | ||
| common file I/O patterns. This is only guidance / an | ||
| estimate, as it is subject to Time-Of-Check to Time-Of-Use (TOCTOU) | ||
| issues / bugs. Both the underlying file descriptor and file may be | ||
| modified outside of the fileio object / Python (ex. gh-90102, GH-121941, | ||
| gh-109523). */ | ||
| struct _Py_stat_struct *stat_atopen; | ||
| PyObject *weakreflist; | ||
| PyObject *dict; | ||
| } fileio; | ||
| @@ -199,8 +204,7 @@ fileio_new(PyTypeObject *type, PyObject *args, PyObject *kwds) | ||
| self->writable = 0; | ||
| self->appending = 0; | ||
| self->seekable = -1; | ||
| self->blksize = 0; | ||
| self->estimated_size = -1; | ||
| self->stat_atopen = NULL; | ||
| self->closefd = 1; | ||
| self->weakreflist = NULL; | ||
| } | ||
| @@ -256,7 +260,6 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode, | ||
| #elif !defined(MS_WINDOWS) | ||
| int *atomic_flag_works = NULL; | ||
| #endif | ||
| struct _Py_stat_struct fdfstat; | ||
| int fstat_result; | ||
| int async_err = 0; | ||
| @@ -454,9 +457,13 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode, | ||
| #endif | ||
| } | ||
| self->blksize = DEFAULT_BUFFER_SIZE; | ||
| self->stat_atopen = PyMem_New(struct _Py_stat_struct, 1); | ||
| if (self->stat_atopen == NULL){ | ||
| PyErr_NoMemory(); | ||
| goto error; | ||
| } | ||
| Py_BEGIN_ALLOW_THREADS | ||
| fstat_result = _Py_fstat_noraise(self->fd, &fdfstat); | ||
| fstat_result = _Py_fstat_noraise(self->fd, self->stat_atopen); | ||
| Py_END_ALLOW_THREADS | ||
| if (fstat_result < 0){ | ||
| /* Tolerate fstat() errors other than EBADF. See Issue #25717, where | ||
| @@ -471,25 +478,21 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode, | ||
| #endif | ||
| goto error; | ||
| } | ||
| PyMem_Free(self->stat_atopen); | ||
| self->stat_atopen = NULL; | ||
| } | ||
| else{ | ||
| #if defined(S_ISDIR) && defined(EISDIR) | ||
| /* On Unix, open will succeed for directories. | ||
| In Python, there should be no file objects referring to | ||
| directories, so we need a check. */ | ||
| if (S_ISDIR(fdfstat.st_mode)){ | ||
| if (S_ISDIR(self->stat_atopen->st_mode)){ | ||
| errno = EISDIR; | ||
| PyErr_SetFromErrnoWithFilenameObject(PyExc_OSError, nameobj); | ||
| goto error; | ||
| } | ||
| #endif /* defined(S_ISDIR) */ | ||
| #ifdef HAVE_STRUCT_STAT_ST_BLKSIZE | ||
| if (fdfstat.st_blksize > 1) | ||
| self->blksize = fdfstat.st_blksize; | ||
| #endif /* HAVE_STRUCT_STAT_ST_BLKSIZE */ | ||
| if (fdfstat.st_size < PY_SSIZE_T_MAX){ | ||
| self->estimated_size = (Py_off_t)fdfstat.st_size; | ||
| } | ||
| } | ||
| #if defined(MS_WINDOWS) || defined(__CYGWIN__) | ||
| @@ -521,6 +524,10 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode, | ||
| internal_close(self); | ||
| _PyErr_ChainExceptions1(exc); | ||
| } | ||
| if (self->stat_atopen != NULL){ | ||
| PyMem_Free(self->stat_atopen); | ||
| self->stat_atopen = NULL; | ||
| } | ||
| done: | ||
| #ifdef MS_WINDOWS | ||
| @@ -553,6 +560,10 @@ fileio_dealloc(fileio *self) | ||
| if (_PyIOBase_finalize((PyObject *) self) < 0) | ||
| return; | ||
| _PyObject_GC_UNTRACK(self); | ||
| if (self->stat_atopen != NULL){ | ||
| PyMem_Free(self->stat_atopen); | ||
| self->stat_atopen = NULL; | ||
| } | ||
| if (self->weakreflist != NULL) | ||
| PyObject_ClearWeakRefs((PyObject *) self); | ||
| (void)fileio_clear(self); | ||
| @@ -725,20 +736,27 @@ _io_FileIO_readall_impl(fileio *self) | ||
| return err_closed(); | ||
| } | ||
| end = self->estimated_size; | ||
| if (self->stat_atopen != NULL && self->stat_atopen->st_size < _PY_READ_MAX){ | ||
| end = (Py_off_t)self->stat_atopen->st_size; | ||
| } | ||
| else{ | ||
| end = -1; | ||
| } | ||
| if (end <= 0){ | ||
| /* Use a default size and resize as needed. */ | ||
| bufsize = SMALLCHUNK; | ||
| } | ||
| else{ | ||
| /* This is probably a real file, so we try to allocate a | ||
| buffer one byte larger than the rest of the file. If the | ||
| calculation is right then we should get EOF without having | ||
| to enlarge the buffer. */ | ||
| /* This is probably a real file. */ | ||
| if (end > _PY_READ_MAX - 1){ | ||
| bufsize = _PY_READ_MAX; | ||
| } | ||
| else{ | ||
| /* In order to detect end of file, need a read() of at | ||
| least 1 byte which returns size 0. Oversize the buffer | ||
| by 1 byte so the I/O can be completed with two read() | ||
| calls (one for all data, one for EOF) without needing | ||
| to resize the buffer. */ | ||
| bufsize = (size_t)end + 1; | ||
| } | ||
| @@ -1094,11 +1112,13 @@ _io_FileIO_truncate_impl(fileio *self, PyTypeObject *cls, PyObject *posobj) | ||
| return NULL; | ||
| } | ||
| /* Sometimes a large file is truncated. While estimated_size is used as a | ||
| estimate, that it is much larger than the actual size can result in a | ||
| significant over allocation and sometimes a MemoryError / running out of | ||
| memory. */ | ||
| self->estimated_size = pos; | ||
| /* Since the file was truncated, its size at open is no longer accurate | ||
| as an estimate. Clear out the stat result, and rely on dynamic resize | ||
| code if a readall is requested. */ | ||
| if (self->stat_atopen != NULL){ | ||
| PyMem_Free(self->stat_atopen); | ||
| self->stat_atopen = NULL; | ||
| } | ||
| return posobj; | ||
| } | ||
| @@ -1229,16 +1249,27 @@ get_mode(fileio *self, void *closure) | ||
| return PyUnicode_FromString(mode_string(self)); | ||
| } | ||
| static PyObject * | ||
| get_blksize(fileio *self, void *closure) | ||
| { | ||
| #ifdef HAVE_STRUCT_STAT_ST_BLKSIZE | ||
| if (self->stat_atopen != NULL && self->stat_atopen->st_blksize > 1){ | ||
Member: I do wonder how realistic the st_blksize values, when available, are for performance purposes; I guess we'll find out.
Member: This PR should not change the buffer size, does it?
Contributor (Author): #117151 (comment) investigated this. I have been watching for new issues with the refactors + optimizations, and we are finding some as people test.
| return PyLong_FromLong(self->stat_atopen->st_blksize); | ||
| } | ||
| #endif /* HAVE_STRUCT_STAT_ST_BLKSIZE */ | ||
| return PyLong_FromLong(DEFAULT_BUFFER_SIZE); | ||
| } | ||
| static PyGetSetDef fileio_getsetlist[] ={ | ||
| {"closed", (getter)get_closed, NULL, "True if the file is closed"}, | ||
| {"closefd", (getter)get_closefd, NULL, | ||
| "True if the file descriptor will be closed by close()."}, | ||
| {"mode", (getter)get_mode, NULL, "String giving the file mode"}, | ||
| {"_blksize", (getter)get_blksize, NULL, "Stat st_blksize if available"}, | ||
| {NULL}, | ||
| }; | ||
| static PyMemberDef fileio_members[] ={ | ||
| {"_blksize", Py_T_UINT, offsetof(fileio, blksize), 0}, | ||
| {"_finalizing", Py_T_BOOL, offsetof(fileio, finalizing), 0}, | ||
| {"__weaklistoffset__", Py_T_PYSSIZET, offsetof(fileio, weakreflist), Py_READONLY}, | ||
| {"__dictoffset__", Py_T_PYSSIZET, offsetof(fileio, dict), Py_READONLY}, | ||