Cython Reverse

Cython Reverse notes

Cython 是什么呢?官方给出的解释是 Cython 是一种新的语言,是带类型的 Python [1]。

Cython 有一个功能叫 Cythonize,这个功能可以将 Python 代码编译成 C 语言实现,再由 GCC/Clang 将 C 代码编译成动态库。

setup.py

1
2
3
4
5
6
from setuptools import setup
from Cython.Build import cythonize

# Translate hello.pyx to C first, then hand the resulting extension list to setuptools.
# annotate=True also writes an HTML page mapping Python source lines to the generated C.
extensions = cythonize("hello.pyx", annotate=True)
setup(ext_modules=extensions)

annotate=True 选项可以生成一个 html 页面,用于显示 Py源码与生成的C代码的对应关系。

build 命令

1
python setup.py build_ext --inplace

Build 成功后,可以在当前目录下找到对应的动态库与对应的 xxx.c 源文件。此时可以直接在 Python 中用 import 导入该动态库(就像导入编译之前的模块那样)。

Cythonize 好处是编译成本地代码后执行效率可以提高,缺点是不太好逆向

本文分析基于 Python 3.7.6,Cython 0.29.21

本文基于下面的 hello.py(setup.py 中写作 hello.pyx)编译成的动态库进行分析:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import datetime
import math

# Simplest sample: call print() with a single string constant.
def myfunc1():
    print("This is myfunc1.")

# Sample: bind an int and a str to locals, then print each with its own print() call.
def test_variables():
    x = 5
    y = "variables test."
    print(x)
    print(y)

# Sample: bind a string constant to a local and print it.
def test_strVar():
    x = "Hello world."
    print(x)

# Sample: read the module-level global `gy` (assigned at module level) and print it.
def test_global_var():
    global gy
    print(gy)


# Sample: explicit int() / str() conversions, printed with one multi-argument print().
def test_cast():
    x = int(5)
    y = str(3)
    print(x, y)

# Sample: a small int, a float, and an int literal far wider than 64 bits.
def test_numbers():
    x = 123
    y = 12.3
    z = 0x112233445566778899AABBCCDD
    print(x, y, z)


# Sample: two-way branch on a comparison against the constant 456.
def test_if(x):
    if x > 456:
        print("x > 456")
    else:
        print("x <= 456")

# Sample: len(), indexing, slicing and the `in` operator on a str.
def test_string():
    x = "I am str."
    y = len(x)
    z = x[1]
    w = x[2:]
    print(x, y, z, w)

    if "am" in x:
        print("yes")
    else:
        print("wrong")

# Sample: list construction, append, len(), iteration, slice read and slice assignment.
def test_list():
    x = list()
    x.append(1)
    x.append(2)
    x.append(3)
    x.append(4)
    x.append("five")
    print(x)
    print(len(x))
    for i in x:
        print(i)
    # The two slice statements run once, after the loop has finished.
    x = x[1:]
    x[2:4] = [22, 33]

# Sample: dict item assignment, item lookup, membership test and key iteration.
def test_dict():
    x = {}
    x["one"] = 1
    x["two"] = 2
    x["three"] = 3
    y = x["one"]
    z = x["two"]
    if "one" in x:
        print(y)

    for k in x:
        print(k, x[k])

# Sample: sum 0..100 with a for loop over range(101), then print the total once.
def test_for():
    s = 0
    for i in range(101):
        s = s + i
    print(s)

# Sample: sum 1..100 with a while loop, then print the total once.
def test_while():
    s = 0
    i = 1
    while i <= 100:
        s = s + i
        i += 1
    print(s)

# Sample: try/except with one typed handler and one catch-all handler.
# int + str raises TypeError, so the NameError handler is skipped and the
# bare `except:` branch is the one that runs.
def test_exception():
    x = 1
    try:
        x = x + "1"
        print(x)
    except NameError:
        print("Variable x is not defined")
    except:
        print("Something else went wrong")

# Sample: attribute chain on an imported module (datetime.datetime.now) followed by a call.
def test_datetime():
    x = datetime.datetime.now()
    print(x)

# Sample: %-style string formatting with a 2-tuple of arguments.
def test_format():
    x = 1
    y = "One"
    z = "%s is %d" % (y, x)
    print(z)

# Sample: calls into the imported math module with a float constant.
def test_math():
    x = math.ceil(1.4)
    y = math.floor(1.4)

    print(x) # returns 2
    print(y) # returns 1

# Sample: function with three positional parameters.
# x is incremented, "2" is concatenated onto y, and z is copied with a full slice.
def test_arg(x, y, z):
    x = x + 1
    y = y + "2"
    z = z[:]
    print(x, y, z)


# Sample class: __init__ stores one instance attribute, one method prints it.
class test_class:
    def __init__(self):
        self.aa = 1

    def test_class_hh(self):
        print(self.aa)



# Module-level global read by test_global_var().
gy = 123
# Driver: run the samples when the module is imported.
# NOTE(review): test_if() and test_class are defined above but are not invoked here.
myfunc1()
test_variables()
test_strVar()
test_global_var()
test_cast()
test_numbers()
test_string()
test_list()
test_dict()
test_for()
test_while()
test_exception()
test_datetime()
test_format()
test_math()
test_arg(1, "2", [4, 5, 6])

字符串表 __pyx_string_tab

字符串表算是 Cython 动态库逆向非常重要的一个入口了,字符串表记录 Python 代码中引用的字符串。

在 hello.c 源文件中可以找到 __pyx_string_tab

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/* One row of the string table: ties a C string constant to the PyObject* slot
 * that will hold the corresponding Python string object. */
typedef struct {PyObject **p; 
const char *s;
const Py_ssize_t n;
const char* encoding;
const char is_unicode;
const char is_str;
const char intern;
} __Pyx_StringTabEntry;

/* Slots for the Python string objects (the `p` column of the table). */
static PyObject *__pyx_kp_s_1;
static PyObject *__pyx_kp_s_Hello_world;

/* Raw C string constants (the `s` column of the table). */
static const char __pyx_k_1[] = "1";
static const char __pyx_k_Hello_world[] = "Hello world.";


/* Rows are {p, s, n, encoding, is_unicode, is_str, intern}.
 * n is sizeof() of the char array, so it counts the trailing NUL.
 * In this excerpt only the __pyx_n_s_ row has intern == 1.
 * The all-zero row terminates the table. */
static __Pyx_StringTabEntry __pyx_string_tab[] = {
{&__pyx_kp_s_1, __pyx_k_1, sizeof(__pyx_k_1), 0, 0, 1, 0},
{&__pyx_kp_s_Hello_world, __pyx_k_Hello_world, sizeof(__pyx_k_Hello_world), 0, 0, 1, 0},
{&__pyx_kp_s_I_am_str, __pyx_k_I_am_str, sizeof(__pyx_k_I_am_str), 0, 0, 1, 0},
{&__pyx_n_s_NameError, __pyx_k_NameError, sizeof(__pyx_k_NameError), 0, 0, 1, 1},
......
{0, 0, 0, 0, 0, 0, 0}
}

/* Initialises the string objects from the table. */
__Pyx_InitStrings(__pyx_string_tab)

__pyx_string_tab 将 PyObject 与对应的字符串关联起来,最后调用 __Pyx_InitStrings 初始化所有字符串对象。

在 IDA 逆向分析时,查找字符串引用,可以快速定位到 __pyx_string_tab,然后根据 __Pyx_StringTabEntry 定位程序对某个字符串 PyObject 的引用。

整数常量的构造

整数在 Cython 里面也是 PyObject,整数常量属于全局变量,在 __Pyx_InitGlobals 中初始化

1
2
3
4
5
6
7
8
9
10
11
12
13
/* Numeric constants from the Python source are created once and cached in globals.
 * The global's name encodes the literal value (__pyx_int_<value>, __pyx_float_<value>). */
__pyx_float_1_4 = PyFloat_FromDouble(1.4); 
__pyx_int_0 = PyInt_FromLong(0);
__pyx_int_1 = PyInt_FromLong(1);
__pyx_int_2 = PyInt_FromLong(2);
__pyx_int_3 = PyInt_FromLong(3);
__pyx_int_4 = PyInt_FromLong(4);
__pyx_int_5 = PyInt_FromLong(5);
__pyx_int_6 = PyInt_FromLong(6);
__pyx_int_22 = PyInt_FromLong(22);
__pyx_int_33 = PyInt_FromLong(33);
__pyx_int_100 = PyInt_FromLong(100);
__pyx_int_123 = PyInt_FromLong(123);
/* The wide literal is built from its decimal string; it is the decimal form of
 * 0x112233445566778899AABBCCDD from test_numbers(). */
__pyx_int_1357463230989497419223659171037 = PyInt_FromString((char *)"1357463230989497419223659171037", 0, 0);

函数声明

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/* Prototypes of the implementation functions, one per Python def.
 * Names follow the visible pattern __pyx_pf_<len(module)><module>_<index><name>,
 * where module-level functions are numbered 0, 2, 4, ... in definition order. */
static PyObject *__pyx_pf_5hello_myfunc1(CYTHON_UNUSED PyObject *__pyx_self); /* proto */
static PyObject *__pyx_pf_5hello_2test_variables(CYTHON_UNUSED PyObject *__pyx_self); /* proto */
static PyObject *__pyx_pf_5hello_4test_strVar(CYTHON_UNUSED PyObject *__pyx_self); /* proto */
static PyObject *__pyx_pf_5hello_6test_global_var(CYTHON_UNUSED PyObject *__pyx_self); /* proto */
static PyObject *__pyx_pf_5hello_8test_cast(CYTHON_UNUSED PyObject *__pyx_self); /* proto */
static PyObject *__pyx_pf_5hello_10test_numbers(CYTHON_UNUSED PyObject *__pyx_self); /* proto */
/* NOTE(review): def test_if(x) takes one argument, so a PyObject *__pyx_v_x parameter
 * would be expected here (compare test_arg below) - confirm against the generated hello.c. */
static PyObject *__pyx_pf_5hello_12test_if(CYTHON_UNUSED PyObject *__pyx_self); /* proto */
static PyObject *__pyx_pf_5hello_14test_string(CYTHON_UNUSED PyObject *__pyx_self); /* proto */
static PyObject *__pyx_pf_5hello_16test_list(CYTHON_UNUSED PyObject *__pyx_self); /* proto */
static PyObject *__pyx_pf_5hello_18test_dict(CYTHON_UNUSED PyObject *__pyx_self); /* proto */
static PyObject *__pyx_pf_5hello_20test_for(CYTHON_UNUSED PyObject *__pyx_self); /* proto */
static PyObject *__pyx_pf_5hello_22test_while(CYTHON_UNUSED PyObject *__pyx_self); /* proto */
static PyObject *__pyx_pf_5hello_24test_exception(CYTHON_UNUSED PyObject *__pyx_self); /* proto */
static PyObject *__pyx_pf_5hello_26test_datetime(CYTHON_UNUSED PyObject *__pyx_self); /* proto */
static PyObject *__pyx_pf_5hello_28test_format(CYTHON_UNUSED PyObject *__pyx_self); /* proto */
static PyObject *__pyx_pf_5hello_30test_math(CYTHON_UNUSED PyObject *__pyx_self); /* proto */
/* Python-level arguments appear as extra PyObject *__pyx_v_<name> parameters. */
static PyObject *__pyx_pf_5hello_32test_arg(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_x, PyObject *__pyx_v_y, PyObject *__pyx_v_z); /* proto */
/* Methods of test_class: the class name is part of the symbol and `self` is an explicit parameter. */
static PyObject *__pyx_pf_5hello_10test_class___init__(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self); /* proto */
static PyObject *__pyx_pf_5hello_10test_class_2test_class_hh(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self); /* proto */

重点看一下 __pyx_pf_5hello_32test_arg 和 __pyx_pf_5hello_10test_class_2test_class_hh。

__pyx_pf_5hello_32test_arg 的 py 定义是 def test_arg(x, y, z) 因此多了几个参数。

__pyx_pf_5hello_10test_class_2test_class_hh 是类函数 test_class.test_class_hh , 所以有一个 self 参数。

myfunc1() 分析

1
2
3
4
5
PyObject *__cdecl PyObject_Call(PyObject *callable_object, PyObject *args, PyObject *kw);

v0 = PyTuple_Pack(1LL, __pyx_kp_s_This_is_myfunc1); // args
v2 = PyObject_GetAttr(__pyx_b, __pyx_n_s_print) // __pyx_b -> PyImport_AddModule("builtins");
PyObject_Call(v2, v1, 0LL)

test_variables() 分析

1
2
# NOTE(review): this excerpt shows myfunc1 although it sits under the test_variables
# heading; the decompiled code that follows it matches test_variables().
def myfunc1():
    print("This is myfunc1.")
1
2
3
4
5
6
7
/* Decompiled shape of test_variables(): y is a reference to the cached string object,
 * x is a freshly created int object. */
v0 = __pyx_kp_s_variables_test; // reference to the string constant
x = PyLong_FromLong(5LL);

// call print(x): pack the argument tuple, fetch print from builtins, call it
args = PyTuple_Pack(1LL, x);
v6 = PyObject_GetAttr(__pyx_b, __pyx_n_s_print);
PyObject_Call(v6, args, 0LL);

test_global_var() 分析

1
2
3
# Excerpt: read the module-level global `gy` and print it.
def test_global_var():
    global gy
    print(gy)
1
2
3
4
5
6
/* Decompiled shape of print(gy). */
// Look up the global name `gy`.
// NOTE(review): __pyx_b is the builtins module elsewhere in this article, so this line
// looks like the builtins fallback of the global lookup - confirm against the binary.
v0 = PyObject_GetAttr(__pyx_b, __pyx_n_s_gy);

// call print(gy): pack the argument tuple, fetch print from builtins, call it
v1 = PyTuple_Pack(1LL, v0);
v3 = PyObject_GetAttr(__pyx_b, __pyx_n_s_print);
PyObject_Call(v3, v1, 0LL);

test_numbers() 分析

1
2
3
4
5
# Excerpt: a small int, a float, and an int literal far wider than 64 bits.
def test_numbers():
    x = 123
    y = 12.3
    z = 0x112233445566778899AABBCCDD
    print(x, y, z)

print 三个参数,构造元组。
decompiler 识别出来的数据结构有问题

1
2
3
4
5
; Build the 3-element argument tuple for print(x, y, z).
; The three element pointers are stored directly into the tuple object at
; consecutive 8-byte slots starting at offset 18h.
mov     edi, 3          ; size
call _PyTuple_New
mov [rax+18h], r12 ; 123
mov [rax+20h], r15 ; 12.3
mov [rax+28h], r14 ; 0x112233445566778899AABBCCDD

这段代码也可以看出 Tuple 构造方法

test_if() 分析

1
2
3
4
5
# Excerpt: two-way branch on a comparison against the constant 456.
def test_if(x):
    if x > 456:
        print("x > 456")
    else:
        print("x <= 456")
1
2
3
4
5
6
7
8
/* Decompiled shape of `if x > 456`: rich comparison, then truth test, then branch. */
v2 = PyObject_RichCompare(a2, __pyx_int_456, 4); // a2 is x; opid 4 is Py_GT in CPython's object.h
v5 = PyObject_IsTrue(v2);
if ( v5 )
goto LABEL_8;
goto LABEL_13;

LABEL_8 => 输出 x > 456
LABEL_13 => 输出 x <= 456

test_for() 分析

1
2
3
4
5
# Excerpt: sum 0..100 with a for loop over range(101), then print the total once.
def test_for():
    s = 0
    for i in range(101):
        s = s + i
    print(s)
1
2
3
4
5
6
7
8
9
10
11
/* Decompiled shape of the for loop: no range object is created.
 * v2 is a plain C counter, v3 is the loop variable i boxed as a Python int,
 * v0 is the running sum s. */
v3 = PyLong_FromLong(0LL);
v0 = __pyx_int_0;
v2 = 0
while ( 1 ) {
v0 = PyNumber_Add(v0, v3); // s = s + i
++v2;
if ( v2 >= 101 )
break;
v3 = PyLong_FromLong(v2); // box the next value of i
}
..... print 代码忽略 ......

range(101) 直接翻译成等价最优形式了。

test_string() 分析

1
2
3
4
5
6
7
8
9
10
11
# Excerpt: len(), indexing, slicing and the `in` operator on a str.
def test_string():
    x = "I am str."
    y = len(x)
    z = x[1]
    w = x[2:]
    print(x, y, z, w)

    if "am" in x:
        print("yes")
    else:
        print("wrong")

下标访问 z = x[1]

1
2
3
4
5
6
7
/* Two shapes seen for z = x[1]: a direct call through the type's sq_item slot,
 * or the generic PyObject_GetItem with a boxed index. */
v0 = __pyx_kp_s_I_am_str;

item = (v4->sq_item)(v0, 1LL); // sq_item performs index access; same contract as PyObject* PySequence_GetItem(PyObject *o, Py_ssize_t i)
或者
v31 = PyLong_FromSsize_t(1LL);
item = PyObject_GetItem(v0, v31);

切片访问 w = x[2:]

1
2
3
/* Decompiled shape of w = x[2:]: a slice object slice(2, None, None) is passed to
 * the type's mp_subscript slot. */
v0 = __pyx_kp_s_I_am_str;
__pyx_slice__2 = PySlice_New(__pyx_int_2, &_Py_NoneStruct, &_Py_NoneStruct);
v6 = (v0->ob_type->tp_as_mapping->mp_subscript)(v0, __pyx_slice__2);

in 关键字

1
/* Decompiled shape of `"am" in x`; v1 is presumably the string x - confirm against the binary. */
v10 = PySequence_Contains(v1, __pyx_n_s_am);

test_list() 分析

1
2
3
4
5
6
7
8
9
10
11
12
13
# Excerpt: list construction, append, len(), iteration, slice read and slice assignment.
def test_list():
    x = list()
    x.append(1)
    x.append(2)
    x.append(3)
    x.append(4)
    x.append("five")
    print(x)
    print(len(x))
    for i in x:
        print(i)
    # The two slice statements run once, after the loop has finished.
    x = x[1:]
    x[2:4] = [22, 33]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
/* Decompiled shape of x = list() followed by the x.append(...) calls:
 * every append goes through PyList_Append on the list created here. */
v0 = PyList_New(0LL);
PyList_Append(v0, __pyx_int_1);
PyList_Append(v0, __pyx_int_2);
......
PyList_Append(v0, __pyx_n_s_five);

; for i in x:
; print(i)
; The loop reads elements straight out of the list's element array:
; [rbx+18h] holds the pointer to the array, r12 is the index.
mov rax, [rbx+18h] // 列表开始
mov r15, [rax+r12*8] // r12 下标

; x = x[1:]
; A new list is allocated and the element pointers are copied with memcpy.
call _PyList_New
mov rcx, rax
mov rax, [rbp+var_38] ; var_38 is the original x
mov r12, [rax+18h] ; address of the first PyObject * element
lea rsi, [r12+8] ; [1:] skips one element
mov rdi, [rcx+18h] ; __dst
lea r15, [r13-1]
lea rdx, ds:0[r15*8] ; __n: r15 is the number of elements to copy
call _memcpy

; x[2:4] = [22, 33]
; build [22, 33]
mov edi, 2
call _PyList_New
mov r13, rax
mov rax, cs:___pyx_int_22
mov rcx, [r13+18h]
mov [rcx], rax
mov rax, cs:___pyx_int_33
mov rcx, [r13+18h]
mov [rcx+8], rax
mov [rbp+var_40], rcx
; x[2:4] =

; create the slice object
mov edi, 2 ; r12 = _PyLong_FromSsize_t(2)
call _PyLong_FromSsize_t
mov r12, rax
mov edi, 4 ; r15 = _PyLong_FromSsize_t(4)
call _PyLong_FromSsize_t
mov r15, rax
mov r13, cs:__Py_NoneStruct_ptr
mov rdi, r12 ; start: 2
mov rsi, rax ; stop : 4
mov rdx, r13 ; step __Py_NoneStruct_ptr
call _PySlice_New
mov [rbp+var_48], rax

; assign to the slice
mov r12, [rbp+var_48]
mov rdi, rbx ;o
mov rsi, r12 ;key: the slice object
mov rdx, [rbp+var_38] ;v
mov rax, [rbp+var_40]
call qword ptr [rax+10h] ;mp_ass_subscript; same contract as int PyObject_SetItem(PyObject *o, PyObject *key, PyObject *v)



test_dict() 分析

1
2
3
4
5
6
7
8
9
10
11
12
# Excerpt: dict item assignment, item lookup, membership test and key iteration.
def test_dict():
    x = {}
    x["one"] = 1
    x["two"] = 2
    x["three"] = 3
    y = x["one"]
    z = x["two"]
    if "one" in x:
        print(y)

    for k in x:
        print(k, x[k])
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
; x = {}
call _PyDict_New
mov rbx, rax ; rbx => x PyObject *

; x["one"] = 1
mov rsi, cs:___pyx_n_s_one ; key
mov rdx, cs:___pyx_int_1 ; v
mov rdi, rbx ; o
call _PyDict_SetItem ; int PyDict_SetItem(PyObject *p, PyObject *key, PyObject *val)

; x["two"] = 2
mov rsi, cs:___pyx_n_s_two ; key
mov rdx, cs:___pyx_int_2 ; item
mov rdi, rbx ; mp
call _PyDict_SetItem

; x["three"] = 3
mov rsi, cs:___pyx_n_s_three ; key
mov rdx, cs:___pyx_int_3 ; item
mov rdi, rbx ; mp
call _PyDict_SetItem

; y = x["one"]
mov rsi, cs:___pyx_n_s_one
mov rdi, rbx
call ___Pyx_PyDict_GetItem
mov [rbp+var_40], rax

; z = x["two"]
mov rsi, cs:___pyx_n_s_two
mov rdi, rbx
call ___Pyx_PyDict_GetItem
mov [rbp+var_38], rax

; if "one" in x
mov rsi, cs:___pyx_n_s_one ; key
mov rdi, rbx ; mp
call _PyDict_Contains ; returns 1 if key is in mp, 0 if not, -1 on error


; for k in x
; Iteration is done with PyDict_Next, which fills pos/key/value on each call.
loop_start:
mov rdi, rbx ; o
lea rsi, [rbp+pos] ; pos
lea rdx, [rbp+key] ; key
lea rcx, [rbp+value] ; value
call _PyDict_Next ; int PyDict_Next(PyObject *p, Py_ssize_t *ppos, PyObject **pkey, PyObject **pvalue)
.......
call ___Pyx_PyDict_GetItem
.......
call _PyTuple_New
.....
call _PyTuple_Pack
....
call print.
.....
call _PyDict_Size
......

test_datetime () 分析

1
2
3
# Excerpt: attribute chain on an imported module (datetime.datetime.now) followed by a call.
def test_datetime():
    x = datetime.datetime.now()
    print(x)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
; x = datetime.datetime.now()
; Step 1: look up the name "datetime" in a dict; [rbx+18h] is passed as the known hash.
; NOTE(review): presumably the module globals dict - rdi is set outside this excerpt.
mov     rbx, cs:___pyx_n_s_datetime
mov rdx, [rbx+18h]
mov rsi, rbx
call __PyDict_GetItem_KnownHash
mov r15, rax

; Step 2: getattr(<module datetime>, "datetime")
mov rsi, cs:___pyx_n_s_datetime
mov rdi, r15 ; PyObject *
call _PyObject_GetAttr
mov rbx, rax

; Step 3: getattr(<class datetime>, "now")
mov rsi, cs:___pyx_n_s_now ;now
mov rdi, rbx
call _PyObject_GetAttr
mov r15, rax

; Step 4: call now() with the shared empty argument tuple
mov rsi, cs:___pyx_empty_tuple
mov rdi, r15 ; callable_object
call ___Pyx_PyObject_Call

........ print(x) 略 ........

test_format() 分析

1
2
3
4
5
# Excerpt: %-style string formatting with a 2-tuple of arguments.
def test_format():
    x = 1
    y = "One"
    z = "%s is %d" % (y, x)
    print(z)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
; z = "%s is %d" % (y, x)
; Build the argument tuple ("One", 1), then call PyUnicode_Format(format, args).
mov     r14, cs:___pyx_n_s_One
mov edi, 1 ; __int64
call _PyLong_FromLong
mov r15, rax ; r15 = _PyLong_FromLong(1)
mov edi, 2 ; size
call _PyTuple_New
mov rbx, rax
mov [rax+18h], r14 ; "One"
mov [rax+20h], r15 ; 1

mov rdi, cs:___pyx_kp_s_s_is_d ; %s is %d
mov rsi, rax ; Tuple("One", 1)
call _PyUnicode_Format

dump PyObject *

1
2
3
4
5
; Dump sequence: PyObject_Str(obj) on the object in rbx, then PyUnicode_AsUTF8
; on the result to obtain a C string.
.text:00007FFBC501B615 mov     rcx, rbx
.text:00007FFBC501B618 call python38_PyObject_Str
.text:00007FFBC501B618
.text:00007FFBC501B61D mov rcx, rax
.text:00007FFBC501B620 call python38_PyUnicode_AsUTF8

这样就可以得到 PyObject * 的字符串 dump 信息。

1
2
3
4
5
6
7
8
9
10
11
12
13
// Usage: attach with `frida -p <pid>`, load this script, then call DumpPyObject(<address>).
/**
 * Print str(obj) for a CPython object that lives in the attached process.
 *
 * @param {NativePointer|string|number} address - address of the PyObject to dump.
 * @param {string} [moduleName="python38.dll"] - name of the loaded CPython DLL.
 *
 * Only x64 targets are handled; on other architectures a message is logged and
 * nothing else happens. Throws if a required export is missing from the module.
 */
function DumpPyObject(address, moduleName) {
    if (Process.arch !== "x64") {
        console.log("DumpPyObject: only x64 is supported, got " + Process.arch);
        return;
    }

    var dllName = moduleName || "python38.dll";
    var module = Process.findModuleByName(dllName);
    if (module === null) {
        console.log("DumpPyObject: module not loaded: " + dllName);
        return;
    }

    // Resolve one export of the Python DLL as a callable native function.
    function api(name, retType, argTypes) {
        var fn = module.findExportByName(name);
        if (fn === null) {
            throw new Error("DumpPyObject: export not found: " + name);
        }
        return new NativeFunction(fn, retType, argTypes, "win64");
    }

    var PyGILState_Ensure = api("PyGILState_Ensure", "int", []);
    var PyGILState_Release = api("PyGILState_Release", "void", ["int"]);
    var PyObject_Str = api("PyObject_Str", "pointer", ["pointer"]);
    var PyUnicode_AsUTF8 = api("PyUnicode_AsUTF8", "pointer", ["pointer"]);
    var PyErr_Clear = api("PyErr_Clear", "void", []);
    var Py_DecRef = api("Py_DecRef", "void", ["pointer"]);

    var native_address = new NativePointer(address);

    // The Python C API may only be used while holding the GIL; the thread that
    // runs this script is not a Python thread, so acquire it explicitly.
    var gil = PyGILState_Ensure();
    try {
        var obj = PyObject_Str(native_address); // new reference, or NULL on error
        if (obj.isNull()) {
            PyErr_Clear(); // do not leave our failure pending in the target
            console.log("DumpPyObject: PyObject_Str failed");
            return;
        }
        try {
            // The returned buffer is owned by obj, so read it before releasing obj.
            var p = PyUnicode_AsUTF8(obj);
            console.log("okkk..");
            if (p.isNull()) {
                PyErr_Clear();
                console.log("DumpPyObject: PyUnicode_AsUTF8 failed");
            } else {
                console.log(p.readUtf8String());
            }
        } finally {
            Py_DecRef(obj); // release the reference returned by PyObject_Str
        }
    } finally {
        PyGILState_Release(gil);
    }
}

参考文章

[1] “The Basics of Cython” https://cython.readthedocs.io/en/latest/src/tutorial/cython_tutorial.html

[2] “Object Protocol” https://docs.python.org/3/c-api/object.html

[3] “Type Objects” https://docs.python.org/3/c-api/typeobj.html#c.PySequenceMethods.sq_item


本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!