Python 源代码的执行流程大概如下图所示:

源代码被转换(编译)成字节码(bytecode),然后在 Python 解释器上下文对字节码进行解释执行

pyc

通过 py_compile 模块可以将 Python 源文件编译成 pyc 格式的文件,实际上就是 Python 解释器可加载的字节码二进制文件格式(类似于 Java 的 class 文件)。

1
2
3
4
5
6
7
8
9
10
11
12
$ ls
add.py
$ python2.7 -m py_compile add.py
$ ls
add.py add.pyc
$ python3.5 -m py_compile add.py
$ python3.6 -m py_compile add.py
$ python3.7 -m py_compile add.py
$ ls
add.py add.pyc __pycache__
$ ls __pycache__/
add.cpython-35.pyc add.cpython-36.pyc add.cpython-37.pyc

pyc 文件的格式随着 Python 版本的变化可能变化,因此解析时需要根据文件头中前四个字节所标示的 magic 值进行判断:

Python < 3.3 pyc 文件头只有 magic 和 timestamp(源文件修改时间)两个字段;

Python >= 3.3 pyc 文件头在 timestamp 之后新增了 size(源文件字节数)字段;

Python >= 3.7 pyc 文件头在 magic 之后新增了 hash(源文件 hash 标示)字段;

注意 hash 只是一个 flag 字段(位域):该字段为 0 时,其后仍然是 timestamp 和 size 两个字段;该字段不为 0 时,原本 timestamp 和 size 所占的 8 个字节改为存放源文件的 64 位哈希值(PEP 552)。

一个简单的解析 pyc 文件的脚本如下(参考了 Ned Batchelder, Eric Snowamedama 的代码):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import binascii
import dis
import marshal
import struct
import sys
import time
import types

# Pick a text buffer class that works on both major Python versions.
try:
    # Python 2 ships a dedicated StringIO module.
    from StringIO import StringIO
except ImportError:
    # Python 3 removed that module; io.StringIO is the replacement.
    from io import StringIO


# One level of indentation; multiplied by the nesting level when printing.
INDENT = ' ' * 4
# Maximum number of bytes rendered on a single hexdump line.
MAX_HEX_LEN = 16
# Width the "name:" label is left-justified to when printing code attributes.
NAME_OFFSET = 20


def hexdump(bytes_value, level=0, wrap=True):
    """Render a byte sequence as space-separated two-digit hex values.

    Args:
        bytes_value: the bytes to render. Iterating it may yield ints
            (Python 3 bytes) or one-character strings (Python 2 str).
        level: nesting level; each output line is prefixed with
            ``INDENT * level`` when ``wrap`` is true.
        wrap: if true, start a new indented line every MAX_HEX_LEN bytes;
            if false, return everything on one line with no indentation.

    Returns:
        The formatted hex string (empty apart from the indent for empty input).
    """
    indent = INDENT * level
    full_rows, leftover = divmod(len(bytes_value), MAX_HEX_LEN)

    # Build one '%02x ...' format per output row, then fill them all at once.
    row_formats = [' '.join(['%02x'] * MAX_HEX_LEN)] * full_rows
    if leftover:
        row_formats.append(' '.join(['%02x'] * leftover))

    if wrap:
        template = indent + ('\n' + indent).join(row_formats)
    else:
        template = ' '.join(row_formats)

    try:
        return template % tuple(bytes_value)
    except TypeError:
        # Elements are characters rather than ints; convert them first.
        return template % tuple(ord(char) for char in bytes_value)


def show_consts(consts, level=0):
    """Print each constant with its index, recursing into nested code objects.

    Args:
        consts: iterable of constants (e.g. a code object's ``co_consts``).
        level: nesting level used to indent the output.
    """
    indent = INDENT * level
    for index, const in enumerate(consts):
        if isinstance(const, types.CodeType):
            # Nested code objects (functions, classes, ...) get a full dump
            # one level deeper instead of a bare repr.
            print('%s%s (code object)' % (indent, index))
            show_code(const, level=level+1)
        else:
            print('%s%s %r' % (indent, index, const))


def show_bytecode(code, level=0):
    """Print a code object's raw bytecode as hex, then its disassembly.

    Args:
        code: the code object whose ``co_code`` is shown.
        level: nesting level used to indent the output.
    """
    indent = INDENT * level
    print(hexdump(code.co_code, level, wrap=True))
    print('%sdisassembled:' % indent)

    # dis.disassemble() only accepts a file= argument from Python 3.4 on, so
    # to keep working on Python 2 its output is captured by swapping
    # sys.stdout. Restore whatever stream was active before (it may already
    # be a redirect), and do so even if disassembly raises.
    buffer = StringIO()
    previous_stdout = sys.stdout
    sys.stdout = buffer
    try:
        dis.disassemble(code)
    finally:
        sys.stdout = previous_stdout

    print(indent + buffer.getvalue().replace('\n', '\n'+indent))


def show_code(code, level=0):
    """Print every ``co_*`` data attribute of a code object.

    ``co_consts`` and ``co_code`` are printed last through show_consts() and
    show_bytecode(); all other attributes are printed as ``name: value`` lines
    in the order dir() returns them.

    Args:
        code: the code object to describe.
        level: nesting level used to indent the output.
    """
    indent = INDENT * level
    for name in dir(code):
        if not name.startswith('co_'):
            continue
        if name in ('co_code', 'co_consts'):
            continue
        value = getattr(code, name)
        if callable(value):
            # Newer interpreters expose co_* methods (e.g. co_lines); they
            # are not data fields, so leave them out of the listing.
            continue
        # The name-based cases must be tested before the generic str case:
        # on Python 2 co_lnotab is itself a str and would otherwise be
        # repr()'d instead of hex-dumped.
        if name == 'co_flags':
            value = '0x%05x' % value
        elif name == 'co_lnotab':
            # wrap=False keeps the dump on one line inside the parentheses.
            value = '0x(%s)' % hexdump(value, wrap=False)
        elif isinstance(value, str):
            value = repr(value)
        print('%s%s%s' % (indent, (name + ':').ljust(NAME_OFFSET), value))
    print('%sco_consts' % indent)
    show_consts(code.co_consts, level=level+1)
    print('%sco_code' % indent)
    show_bytecode(code, level=level+1)


def show_file(fname):
    """Print the header fields and the code object stored in a pyc file.

    The header layout is not derived from the magic number. Instead the three
    known header sizes are tried in turn -- 8 bytes (magic, timestamp),
    12 bytes (adds size) and 16 bytes (adds a flags word right after magic) --
    and the first size after which the rest of the file unmarshals to a code
    object is taken as the file's layout.

    For the 16-byte layout, a zero flags word means the following 8 bytes are
    timestamp and size; a non-zero flags word means they hold the source hash
    instead, in which case flags and hash are printed rather than a timestamp
    and size.

    Args:
        fname: path of the pyc file to read.

    Raises:
        ValueError: if no header layout yields a code object.
    """
    with open(fname, 'rb') as f:
        data = f.read()

    # NOTE: marshal is not safe against malicious input; only run this on pyc
    # files from a trusted source.
    code = None
    header_size = None
    for candidate in (8, 12, 16):
        try:
            loaded = marshal.loads(data[candidate:])
        except (ValueError, EOFError, TypeError):
            # Starting at the wrong offset feeds marshal garbage, which it may
            # reject with any of these; move on to the next layout.
            continue
        if isinstance(loaded, types.CodeType):
            code, header_size = loaded, candidate
            break
    if code is None:
        raise ValueError('Could not parse pyc file')

    magic = data[:4]
    raw_flags = raw_hash = raw_timestamp = raw_size = None
    if header_size == 8:
        raw_timestamp = data[4:8]
    elif header_size == 12:
        raw_timestamp, raw_size = data[4:8], data[8:12]
    else:
        # Header fields are little-endian regardless of the host byte order.
        if struct.unpack('<L', data[4:8])[0] == 0:
            raw_timestamp, raw_size = data[8:12], data[12:16]
        else:
            raw_flags, raw_hash = data[4:8], data[8:16]

    print('magic %s' % (binascii.hexlify(magic)))
    if raw_flags is not None:
        print('flags %s' % (binascii.hexlify(raw_flags)))
        print('hash %s' % (binascii.hexlify(raw_hash)))
    if raw_timestamp is not None:
        timestamp = time.asctime(time.localtime(struct.unpack('<L', raw_timestamp)[0]))
        print('timestamp %s (%s)' % (binascii.hexlify(raw_timestamp), timestamp))
    if raw_size is not None:
        size = struct.unpack('<L', raw_size)[0]
        print('size %s (%s)' % (binascii.hexlify(raw_size), size))

    print('code')
    show_code(code)


if __name__ == '__main__':
    # Exactly one argument (the pyc path) is expected; say so instead of
    # silently doing nothing when it is missing or extra arguments are given.
    if len(sys.argv) != 2:
        sys.exit('usage: %s <file.pyc>' % sys.argv[0])
    show_file(sys.argv[1])

需要注意的是,在这个脚本的第一行并没有加上 shebang(即 #!/usr/bin/python 之类的字样),这是因为解析 Python2 的 pyc 文件需要使用 Python2 解释器,而解析 Python3 的 pyc 文件需要使用 Python3 解释器(因为不同版本的内置 marshal 模块对 code 对象不兼容)。

脚本输出中的 co_names, co_consts 等字段的含义可以参考 Python 文档 The standard type hierarchy 中的 Internal types 一节。

bytecode

上面提到 pyc 是 Python 字节码的二进制文件格式,我们可以使用 dis 模块直接观察字节码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
$ python3 -m dis add.py
1 0 LOAD_CONST 0 (<code object add at 0x555f9b86dab0, file "add.py", line 1>)
2 LOAD_CONST 1 ('add')
4 MAKE_FUNCTION 0
6 STORE_NAME 0 (add)

4 8 LOAD_CONST 2 (1)
10 STORE_NAME 1 (a)

5 12 LOAD_CONST 3 (2)
14 STORE_NAME 2 (b)

6 16 LOAD_NAME 0 (add)
18 LOAD_NAME 1 (a)
20 LOAD_NAME 2 (b)
22 LOAD_CONST 4 (3)
24 CALL_FUNCTION 3
26 STORE_NAME 3 (sum)

7 28 LOAD_NAME 4 (print)
30 LOAD_NAME 3 (sum)
32 CALL_FUNCTION 1
34 POP_TOP
36 LOAD_CONST 5 (None)
38 RETURN_VALUE

Disassembly of <code object add at 0x555f9b86dab0, file "add.py", line 1>:
2 0 LOAD_FAST 0 (a)
2 LOAD_FAST 1 (b)
4 BINARY_ADD
6 LOAD_FAST 2 (c)
8 BINARY_ADD
10 RETURN_VALUE

其中第一列为源代码所在行号,第二列为指令在字节码中的偏移(即该指令在上面脚本输出的 co_code 字段中的字节偏移),第三列为指令名,第四列对于 LOAD_CONST, LOAD_NAME, LOAD_GLOBAL 等指令为指令参数在 co_consts, co_names tuple 中的索引,对于 CALL_FUNCTION 指令则表示需要从数据栈(evaluation stack)取指定个数的参数(具体的指令及其参数可以参考 Python Bytecode Instructions),第五列为第四列索引所对应的名字(如果有的话)。结合 pyc 文件的解析,从源代码到字节码的转换应该是比较容易理解的。

指令及其二进制表示可以通过如下方式进行查看:

1
2
3
4
5
6
7
>>> import dis
>>> dis.opmap['LOAD_CONST']
100
>>> dis.opname[100]
'LOAD_CONST'
>>> dis.opname[0x7c]
'LOAD_FAST'

除了通过 py_compile 模块直接生成 pyc 文件,然后通过 dis 模块解析来分析字节码之外,还可以在 Python 的交互终端中进行实验:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
>>> import dis
>>> g = 3
>>> def x(a, b):
... global g
... a += 1
... b += 2
... g += 3
... return a + b + g
...
>>> dir(x.__code__)
['__class__', '__cmp__', '__delattr__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__le__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'co_argcount', 'co_cellvars', 'co_code', 'co_consts', 'co_filename', 'co_firstlineno', 'co_flags', 'co_freevars', 'co_lnotab', 'co_name', 'co_names', 'co_nlocals', 'co_stacksize', 'co_varnames']
>>> dis.disassemble(x.__code__)
3 0 LOAD_FAST 0 (a)
3 LOAD_CONST 1 (1)
6 INPLACE_ADD
7 STORE_FAST 0 (a)

4 10 LOAD_FAST 1 (b)
13 LOAD_CONST 2 (2)
16 INPLACE_ADD
17 STORE_FAST 1 (b)

5 20 LOAD_GLOBAL 0 (g)
23 LOAD_CONST 3 (3)
26 INPLACE_ADD
27 STORE_GLOBAL 0 (g)

6 30 LOAD_FAST 0 (a)
33 LOAD_FAST 1 (b)
36 BINARY_ADD
37 LOAD_GLOBAL 0 (g)
40 BINARY_ADD
41 RETURN_VALUE
>>> dis.dis(x)
3 0 LOAD_FAST 0 (a)
3 LOAD_CONST 1 (1)
6 INPLACE_ADD
7 STORE_FAST 0 (a)

4 10 LOAD_FAST 1 (b)
13 LOAD_CONST 2 (2)
16 INPLACE_ADD
17 STORE_FAST 1 (b)

5 20 LOAD_GLOBAL 0 (g)
23 LOAD_CONST 3 (3)
26 INPLACE_ADD
27 STORE_GLOBAL 0 (g)

6 30 LOAD_FAST 0 (a)
33 LOAD_FAST 1 (b)
36 BINARY_ADD
37 LOAD_GLOBAL 0 (g)
40 BINARY_ADD
41 RETURN_VALUE
>>> code = compile('r = x(4, 5);print(r)', '<raw string>', 'exec')
>>> exec(code)
18
>>> eval(code)
21
>>> dir(code)
['__class__', '__cmp__', '__delattr__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__le__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'co_argcount', 'co_cellvars', 'co_code', 'co_consts', 'co_filename', 'co_firstlineno', 'co_flags', 'co_freevars', 'co_lnotab', 'co_name', 'co_names', 'co_nlocals', 'co_stacksize', 'co_varnames']
>>> dis.disassemble(code)
1 0 LOAD_NAME 0 (x)
3 LOAD_CONST 0 (4)
6 LOAD_CONST 1 (5)
9 CALL_FUNCTION 2
12 STORE_NAME 1 (r)
15 LOAD_NAME 1 (r)
18 PRINT_ITEM
19 PRINT_NEWLINE
20 LOAD_CONST 2 (None)
23 RETURN_VALUE
>>> dis.dis(code)
1 0 LOAD_NAME 0 (x)
3 LOAD_CONST 0 (4)
6 LOAD_CONST 1 (5)
9 CALL_FUNCTION 2
12 STORE_NAME 1 (r)
15 LOAD_NAME 1 (r)
18 PRINT_ITEM
19 PRINT_NEWLINE
20 LOAD_CONST 2 (None)
23 RETURN_VALUE
>>>

参考资料

An introduction to Python bytecode

https://opensource.com/article/18/4/introduction-python-bytecode

Types and Objects in Python

http://www.informit.com/articles/article.aspx?p=453682&seqNum=5

.pyc file format of Python: format specification

http://formats.kaitai.io/python_pyc_27/index.html

The structure of .pyc files

https://nedbatchelder.com/blog/200804/the_structure_of_pyc_files.html

A Python Interpreter Written in Python

http://www.aosabook.org/en/500L/a-python-interpreter-written-in-python.html

Computed goto for efficient dispatch tables

https://eli.thegreenplace.net/2012/07/12/computed-goto-for-efficient-dispatch-tables

A cross-version Python bytecode decompiler

https://github.com/rocky/python-uncompyle6/

The standard type hierarchy - Internal types

https://docs.python.org/2.7/reference/datamodel.html#the-standard-type-hierarchy

marshal - Internal Python object serialization

https://docs.python.org/2.7/library/marshal.html