Cython Reverse notes Cython 是什么呢?官方给出的解释是 Cython 是一种新的语言,是带类型的 Python [1]。
Cython 有一个功能叫 Cythonize
这个功能可以将 Python 编译成 C语言实现,再由 GCC/Clang 将 C 编译成动态库。
setup.py
from setuptools import setup
from Cython.Build import cythonize

setup(
    ext_modules = cythonize("hello.pyx", annotate=True)
)
annotate=True 选项可以生成一个 html 页面,用于显示 Py源码与生成的C代码的对应关系。
build 命令
1 python setup.py build_ext --inplace
Build 成功后,可以在当前目录下找到对应的动态库与对应的 xxx.c 源文件。此时可以直接用 python import 导入(就像导入编译之前的那样)
Cythonize 好处是编译成本地代码后执行效率可以提高,缺点是不太好逆向
本文分析基于 Python 3.7.6,Cython 0.29.21
本文基于 hello.py 编译成的动态库分析
import datetime
import math

def myfunc1():
    print("This is myfunc1.")

def test_variables():
    x = 5
    y = "variables test."
    print(x)
    print(y)

def test_strVar():
    x = "Hello world."
    print(x)

def test_global_var():
    global gy
    print(gy)

def test_cast():
    x = int(5)
    y = str(3)
    print(x, y)

def test_numbers():
    x = 123
    y = 12.3
    z = 0x112233445566778899AABBCCDD
    print(x, y, z)

def test_if(x):
    if x > 456:
        print("x > 456")
    else:
        print("x <= 456")

def test_string():
    x = "I am str."
    y = len(x)
    z = x[1]
    w = x[2:]
    print(x, y, z, w)
    if "am" in x:
        print("yes")
    else:
        print("wrong")

def test_list():
    x = list()
    x.append(1)
    x.append(2)
    x.append(3)
    x.append(4)
    x.append("five")
    print(x)
    print(len(x))
    for i in x:
        print(i)
    x = x[1:]
    x[2:4] = [22, 33]

def test_dict():
    x = {}
    x["one"] = 1
    x["two"] = 2
    x["three"] = 3
    y = x["one"]
    z = x["two"]
    if "one" in x:
        print(y)
    for k in x:
        print(k, x[k])

def test_for():
    s = 0
    for i in range(101):
        s = s + i
    print(s)

def test_while():
    s = 0
    i = 1
    while i <= 100:
        s = s + i
        i += 1
    print(s)

def test_exception():
    x = 1
    try:
        x = x + "1"
        print(x)
    except NameError:
        print("Variable x is not defined")
    except:
        print("Something else went wrong")

def test_datetime():
    x = datetime.datetime.now()
    print(x)

def test_format():
    x = 1
    y = "One"
    z = "%s is %d" % (y, x)
    print(z)

def test_math():
    x = math.ceil(1.4)
    y = math.floor(1.4)
    print(x)
    print(y)

def test_arg(x, y, z):
    x = x + 1
    y = y + "2"
    z = z[:]
    print(x, y, z)

class test_class:
    def __init__(self):
        self.aa = 1

    def test_class_hh(self):
        print(self.aa)

gy = 123

myfunc1()
test_variables()
test_strVar()
test_global_var()
test_cast()
test_numbers()
test_string()
test_list()
test_dict()
test_for()
test_while()
test_exception()
test_datetime()
test_format()
test_math()
test_arg(1, "2", [4, 5, 6])
字符串表 __pyx_string_tab 字符串表算是 Cython 动态库逆向非常重要的一个入口了,字符串表记录 Python 代码中引用的字符串。
在 hello.c 源文件中可以找到 __pyx_string_tab
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 typedef struct { PyObject **p; const char *s; const Py_ssize_t n; const char * encoding; const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry;static PyObject *__pyx_kp_s_1;static PyObject *__pyx_kp_s_Hello_world;static const char __pyx_k_1[] = "1" ;static const char __pyx_k_Hello_world[] = "Hello world." ;static __Pyx_StringTabEntry __pyx_string_tab[] = { {&__pyx_kp_s_1, __pyx_k_1, sizeof (__pyx_k_1), 0 , 0 , 1 , 0 }, {&__pyx_kp_s_Hello_world, __pyx_k_Hello_world, sizeof (__pyx_k_Hello_world), 0 , 0 , 1 , 0 }, {&__pyx_kp_s_I_am_str, __pyx_k_I_am_str, sizeof (__pyx_k_I_am_str), 0 , 0 , 1 , 0 }, {&__pyx_n_s_NameError, __pyx_k_NameError, sizeof (__pyx_k_NameError), 0 , 0 , 1 , 1 }, ...... {0 , 0 , 0 , 0 , 0 , 0 , 0 } } __Pyx_InitStrings(__pyx_string_tab)
__pyx_string_tab
将 PyObject
与对应的字符串关联起来,最后调用 __Pyx_InitStrings
初始化所有字符串对象。
在 IDA 逆向分析时,查找字符串引用,可以快速定位到__pyx_string_tab
,然后根据 __Pyx_StringTabEntry
定位程序对某个字符串 PyObject
的引用。
整数常量的构造 整数在 Cython 里面也是 PyObject
,整数常量属于全局变量,在 __Pyx_InitGlobals
中初始化
1 2 3 4 5 6 7 8 9 10 11 12 13 __pyx_float_1_4 = PyFloat_FromDouble(1.4 ); __pyx_int_0 = PyInt_FromLong(0 ); __pyx_int_1 = PyInt_FromLong(1 ); __pyx_int_2 = PyInt_FromLong(2 ); __pyx_int_3 = PyInt_FromLong(3 ); __pyx_int_4 = PyInt_FromLong(4 ); __pyx_int_5 = PyInt_FromLong(5 ); __pyx_int_6 = PyInt_FromLong(6 ); __pyx_int_22 = PyInt_FromLong(22 ); __pyx_int_33 = PyInt_FromLong(33 ); __pyx_int_100 = PyInt_FromLong(100 ); __pyx_int_123 = PyInt_FromLong(123 ); __pyx_int_1357463230989497419223659171037 = PyInt_FromString((char *)"1357463230989497419223659171037" , 0 , 0 );
函数声明 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 static PyObject *__pyx_pf_5hello_myfunc1(CYTHON_UNUSED PyObject *__pyx_self); static PyObject *__pyx_pf_5hello_2test_variables(CYTHON_UNUSED PyObject *__pyx_self); static PyObject *__pyx_pf_5hello_4test_strVar(CYTHON_UNUSED PyObject *__pyx_self); static PyObject *__pyx_pf_5hello_6test_global_var(CYTHON_UNUSED PyObject *__pyx_self); static PyObject *__pyx_pf_5hello_8test_cast(CYTHON_UNUSED PyObject *__pyx_self); static PyObject *__pyx_pf_5hello_10test_numbers(CYTHON_UNUSED PyObject *__pyx_self); static PyObject *__pyx_pf_5hello_12test_if(CYTHON_UNUSED PyObject *__pyx_self); static PyObject *__pyx_pf_5hello_14test_string(CYTHON_UNUSED PyObject *__pyx_self); static PyObject *__pyx_pf_5hello_16test_list(CYTHON_UNUSED PyObject *__pyx_self); static PyObject *__pyx_pf_5hello_18test_dict(CYTHON_UNUSED PyObject *__pyx_self); static PyObject *__pyx_pf_5hello_20test_for(CYTHON_UNUSED PyObject *__pyx_self); static PyObject *__pyx_pf_5hello_22test_while(CYTHON_UNUSED PyObject *__pyx_self); static PyObject *__pyx_pf_5hello_24test_exception(CYTHON_UNUSED PyObject *__pyx_self); static PyObject *__pyx_pf_5hello_26test_datetime(CYTHON_UNUSED PyObject *__pyx_self); static PyObject *__pyx_pf_5hello_28test_format(CYTHON_UNUSED PyObject *__pyx_self); static PyObject *__pyx_pf_5hello_30test_math(CYTHON_UNUSED PyObject *__pyx_self); static PyObject *__pyx_pf_5hello_32test_arg(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_x, PyObject *__pyx_v_y, PyObject *__pyx_v_z); static PyObject *__pyx_pf_5hello_10test_class___init__(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self); static PyObject *__pyx_pf_5hello_10test_class_2test_class_hh(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self);
重点看一下 __pyx_pf_5hello_32test_arg
与 __pyx_pf_5hello_10test_class_2test_class_hh
__pyx_pf_5hello_32test_arg
的 py 定义是 def test_arg(x, y, z)
因此比其他函数多了 __pyx_v_x、__pyx_v_y、__pyx_v_z 三个参数,分别对应 x、y、z。
__pyx_pf_5hello_10test_class_2test_class_hh
是类函数 test_class.test_class_hh , 所以有一个 self 参数。
myfunc1() 分析 1 2 def myfunc1 (): print("This is myfunc1." )
1 2 3 4 5 PyObject *__cdecl PyObject_Call (PyObject *callable_object, PyObject *args, PyObject *kw) ; v1 = PyTuple_Pack(1LL , __pyx_kp_s_This_is_myfunc1); v2 = PyObject_GetAttr(__pyx_b, __pyx_n_s_print); PyObject_Call(v2, v1, 0LL )
test_variables() 分析 1 2 3 4 5 def test_variables (): x = 5 y = "variables test." print(x) print(y)
1 2 3 4 5 6 7 v0 = __pyx_kp_s_variables_test; x = PyLong_FromLong(5LL ); args = PyTuple_Pack(1LL , x); v6 = PyObject_GetAttr(__pyx_b, __pyx_n_s_print); PyObject_Call(v6, args, 0LL );
test_global_var() 分析 1 2 3 def test_global_var (): global gy print(gy)
1 2 3 4 5 6 v0 = PyObject_GetAttr(__pyx_b, __pyx_n_s_gy); v1 = PyTuple_Pack(1LL , v0); v3 = PyObject_GetAttr(__pyx_b, __pyx_n_s_print); PyObject_Call(v3, v1, 0LL );
test_numbers() 分析 1 2 3 4 5 def test_numbers (): x = 123 y = 12.3 z = 0x112233445566778899AABBCCDD print(x, y, z)
print 有三个参数,需要先构造一个包含三个元素的元组。decompiler 识别出来的数据结构有问题,因此这里直接看汇编:
1 2 3 4 5 mov edi, 3 ; size call _PyTuple_New mov [rax+18h], r12 ; 123 mov [rax+20h], r15 ; 12.3 mov [rax+28h], r14 ; 0x112233445566778899AABBCCDD
从这段代码也可以看出 Tuple 的构造方法:先调用 PyTuple_New 创建指定大小的元组,再把各元素的 PyObject * 依次写入对象偏移 0x18、0x20、0x28 处。
test_if() 分析 1 2 3 4 5 def test_if (x ): if x > 456 : print("x > 456" ) else : print("x <= 456" )
1 2 3 4 5 6 7 8 v2 = PyObject_RichCompare(a2, __pyx_int_456, 4 ); v5 = PyObject_IsTrue(v2);if ( v5 ) goto LABEL_8;goto LABEL_13; LABEL_8 => 输出 x > 456 LABEL_13 => 输出 x <= 456
test_for() 分析 1 2 3 4 5 def test_for (): s = 0 for i in range (101 ): s = s + i print(s)
1 2 3 4 5 6 7 8 9 10 11 v3 = PyLong_FromLong(0LL); v0 = __pyx_int_0; v2 = 0 while ( 1 ) { v0 = PyNumber_Add(v0, v3); ++v2; if ( v2 >= 101 ) break; v3 = PyLong_FromLong(v2); }. .... print 代码忽略 ......
for i in range(101) 被直接翻译成了等价的 C 计数循环(v2 从 0 递增到 101),并没有真正构造 range 对象。
test_string() 分析 1 2 3 4 5 6 7 8 9 10 11 def test_string (): x = "I am str." y = len (x) z = x[1 ] w = x[2 :] print(x, y, z, w) if "am" in x: print("yes" ) else : print("wrong" )
下标访问 z = x[1]
1 2 3 4 5 6 7 v0 = __pyx_kp_s_I_am_str; item = (v4->sq_item)(v0, 1LL ); 或者 v31 = PyLong_FromSsize_t(1LL ); item = PyObject_GetItem(v0, v31);
切片访问 w = x[2:]
1 2 3 v0 = __pyx_kp_s_I_am_str; __pyx_slice__2 = PySlice_New(__pyx_int_2, &_Py_NoneStruct, &_Py_NoneStruct); v6 = (v0->ob_type->tp_as_mapping->mp_subscript)(v0, __pyx_slice__2);
in
关键字
1 v10 = PySequence_Contains(v1, __pyx_n_s_am)
test_list() 分析 1 2 3 4 5 6 7 8 9 10 11 12 13 def test_list (): x = list () x.append(1 ) x.append(2 ) x.append(3 ) x.append(4 ) x.append("five" ) print(x) print(len (x)) for i in x: print(i) x = x[1 :] x[2 :4 ] = [22 , 33 ]
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 v1 = PyList_New(0LL); PyList_Append(v1, __pyx_int_1); PyList_Append(v1, __pyx_int_2); ...... PyList_Append(v1, __pyx_n_s_five); ; for i in x: ; print(i) mov rax, [rbx+18h] // 列表开始 mov r15, [rax+r12*8] // r12 下标 ; x = x[1:] call _PyList_New mov rcx, rax mov rax, [rbp+var_38] ; var_38 是原来的 x mov r12, [rax+18h] ; 第一个 PyObject * 地址 lea rsi, [r12+8] ; [1:] 跳过一个元素 mov rdi, [rcx+18h] ; __dst lea r15, [r13-1] lea rdx, ds:0[r15*8] ; __n r15 复制的元素的个数 call _memcpy ;x[2:4] = [22, 33] ;构造 [22, 33] mov edi, 2 call _PyList_New mov r13, rax mov rax, cs:___pyx_int_22 mov rcx, [r13+18h] mov [rcx], rax mov rax, cs:___pyx_int_33 mov rcx, [r13+18h] mov [rcx+8], rax mov [rbp+var_40], rcx ; x[2:4] = ;创建切片 mov edi, 2 ; r12 = _PyLong_FromSsize_t(2) call _PyLong_FromSsize_t mov r12, rax mov edi, 4 ; r15 = _PyLong_FromSsize_t(4) call _PyLong_FromSsize_t mov r15, rax mov r13, cs:__Py_NoneStruct_ptr mov rdi, r12 ; start: 2 mov rsi, rax ; stop : 4 mov rdx, r13 ; step __Py_NoneStruct_ptr call _PySlice_New mov [rbp+var_48], rax ;给切片赋值 mov r12, [rbp+var_48] mov rdi, rbx ;o mov rsi, r12 ;key, 切片 mov rdx, [rbp+var_38] ;v mov rax, [rbp+var_40] call qword ptr [rax+10h] ;mp_ass_subscript int PyObject_SetItem(PyObject *o, PyObject *key, PyObject *v)
test_dict() 分析 1 2 3 4 5 6 7 8 9 10 11 12 def test_dict (): x = {} x["one" ] = 1 x["two" ] = 2 x["three" ] = 3 y = x["one" ] z = x["two" ] if "one" in x: print(y) for k in x: print(k, x[k])
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 ; x = {} call _PyDict_New mov rbx, rax ; rbx => x PyObject * ; x["one"] = 1 mov rsi, cs:___pyx_n_s_one ; key mov rdx, cs:___pyx_int_1 ; v mov rdi, rbx ; o call _PyDict_SetItem ;int PyDict_SetItem(PyObject *p, PyObject *key, PyObject *val) ; x["two"] = 2 mov rsi, cs:___pyx_n_s_two ; key mov rdx, cs:___pyx_int_2 ; item mov rdi, rbx ; mp call _PyDict_SetItem ; x["three"] = 3 mov rsi, cs:___pyx_n_s_three ; key mov rdx, cs:___pyx_int_3 ; item mov rdi, rbx ; mp call _PyDict_SetItem ; y = x["one"] mov rsi, cs:___pyx_n_s_one mov rdi, rbx call ___Pyx_PyDict_GetItem mov [rbp+var_40], rax ; z = x["two"] mov rsi, cs:___pyx_n_s_two mov rdi, rbx call ___Pyx_PyDict_GetItem mov [rbp+var_38], rax ; if "one" in x mov rsi, cs:___pyx_n_s_one ; key mov rdi, rbx ; mp call _PyDict_Contains ; key in mp return 1, not in return 0, error: -1 ; for k in x loop_start: mov rdi, rbx ; o lea rsi, [rbp+pos] ; pos lea rdx, [rbp+key] ; key lea rcx, [rbp+value] ; value call _PyDict_Next ; int PyDict_Next(PyObject *p, Py_ssize_t *ppos, PyObject **pkey, PyObject **pvalue) ....... call ___Pyx_PyDict_GetItem ....... call _PyTuple_New ..... call _PyTuple_Pack .... call print. ..... call _PyDict_Size ......
test_datetime() 分析 1 2 3 def test_datetime (): x = datetime.datetime.now() print(x)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 mov rbx, cs:___pyx_n_s_datetime mov rdx, [rbx+18h] mov rsi, rbx call __PyDict_GetItem_KnownHash mov r15, rax mov rsi, cs:___pyx_n_s_datetime mov rdi, r15 ; PyObject * call _PyObject_GetAttr mov rbx, rax mov rsi, cs:___pyx_n_s_now ;now mov rdi, rbx call _PyObject_GetAttr mov r15, rax mov rsi, cs:___pyx_empty_tuple mov rdi, r15 ; callable_object call ___Pyx_PyObject_Call ........ print(x) 略 ........
test_format() 分析 1 2 3 4 5 def test_format (): x = 1 y = "One" z = "%s is %d" % (y, x) print(z)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 mov r14, cs:___pyx_n_s_One mov edi, 1 ; __int64 call _PyLong_FromLong mov r15, rax ; r15 = _PyLong_FromLong(1) mov edi, 2 ; size call _PyTuple_New mov rbx, rax mov [rax+18h], r14 ; "One" mov [rax+20h], r15 ; 1 mov rdi, cs:___pyx_kp_s_s_is_d ; %s is %d mov rsi, rax ; Tuple("One", 1) call _PyUnicode_Format
dump PyObject * 1 2 3 4 5 .text:00007FFBC501B615 mov rcx, rbx .text:00007FFBC501B618 call python38_PyObject_Str .text:00007FFBC501B618 .text:00007FFBC501B61D mov rcx, rax .text:00007FFBC501B620 call python38_PyUnicode_AsUTF8
这样就可以得到 PyObject * 的字符串 dump 信息:先调用 PyObject_Str 得到对象的字符串表示,再调用 PyUnicode_AsUTF8 取出对应的 UTF-8 字符串。下面是用 Frida 实现的 dump 脚本:
1 2 3 4 5 6 7 8 9 10 11 12 13 function DumpPyObject (address ) { if (Process.arch === "x64" ) { var native_address = new NativePointer(address); var module = Process.findModuleByName("python38.dll" ); var PyObject_Str = new NativeFunction(module .findExportByName("PyObject_Str" ), "pointer" , ["pointer" ], "win64" ); var PyUnicode_AsUTF8 = new NativeFunction(module .findExportByName("PyUnicode_AsUTF8" ), "pointer" , ["pointer" ], "win64" ); var obj = PyObject_Str(native_address); var p = PyUnicode_AsUTF8(obj); console .log("okkk.." ); console .log(p.readUtf8String()); } }
参考文章 [1] “The Basics of Cython” https://cython.readthedocs.io/en/latest/src/tutorial/cython_tutorial.html
[2] “Object Protocol” https://docs.python.org/3/c-api/object.html
[3] “Type Objects” https://docs.python.org/3/c-api/typeobj.html#c.PySequenceMethods.sq_item