Jump to content
Tuts 4 You

Help me rebuild this challenging reversed function of a raw bare metal firmware


Recommended Posts

CuriousReverser
Posted (edited)

I have this code from a tricore processor and I am trying to reverse engineer it in order to rebuild it using c.
It is a seed -> key algorithm that gets a 4-byte seed passed in (d4 register), runs some algorithm on it, and returns a 4-byte key (d2). The d1 register is used in the calculation as well.
First the extracted assembly:

;-----------------------------------------------------------------------
; Seed -> key routine (TriCore, TC1766), as extracted from the firmware.
; In:    d4  = 32-bit seed
; Out:   d2  = 32-bit key
; Uses:  d0/d1 (written together as the 64-bit pair e0), d3, d5, d6, d15
;        d4 is overwritten (intermediate value, then the loop limit).
;-----------------------------------------------------------------------
        movh    d0,#0x7777                  ; d0 = 0x7777'0000
        sha     d15,d4,#-0x1f               ; d15 = seed >> 31 (arithmetic): -1 if seed < 0, else 0
        addi    d0,d0,#0x123                ; d0 = 0x7777'0123
        mul     e0,d4,d0                    ; e0 = seed * 0x77770123 (64-bit); low word -> d0, high word -> d1
        sha     d0,d1,#-0xd                 ; d0 = high word of the product >> 13 (arithmetic)
        sub     d0,d15                      ; d0 = d0 - d15 (adds 1 when the seed is negative)
        mov     d15,#0x787c
        msub    d4,d4,d0,d15                ; d4 = d4 - d0 * 0x787c
        mov     d15,#0x41a3
        mul     d4,d15                      ; d4 = d4 * 0x41a3
        mov     d15,#0xb15
        msub    d4,d4,d0,d15                ; d4 = d4 - d0 * 0xb15
        mov     d15,d4                      ; d15 = start value for the loop below
        jgtz    d4,LAB_AL1_HANDLER          ; keep it as-is when it is > 0
        addih   d15,d4,#0x8000              ; otherwise d15 = d4 + 0x8000'0000 ...
        add     d15,#-0x1                   ; ... - 1, i.e. d4 + 0x7fff'ffff
LAB_AL1_HANDLER:
        movh    d6,#0xA3A3                  ; d6 = 0xA3A3'0000
        movh    d5,#0x26ba                  ; NOTE(review): operand reads 0x26ba, but the thread's own
                                            ; annotation and both C versions use 0xB4B4 here -- confirm
                                            ; against the firmware image
        mov     d2,d15                      ; d2 = running key
        mov     d3,#0x0                     ; d3 = round counter
        addi    d6,d6,#0x1234               ; d6 = 0xA3A3'1234
        addi    d5,d5,#0x4321               ; d5 = <upper half from movh>'4321
        mov     d4,#0x13                    ; d4 = round limit (0x13)
LAB_AL1_LOOP:
        sha     d15,d2,#0x1                 ; d15 = d2 << 1
        ge      d0,d2,#0x0                  ; d0 = 1 if d2 >= 0 (signed), else 0
        xor     d1,d15,d6                   ; d1  = (d2 << 1) ^ d6
        xor     d15,d5                      ; d15 = (d2 << 1) ^ d5
        sel     d2,d0,d15,d1                ; d2 = (d0 != 0) ? d15 : d1
        add     d15,d3,#0x1
        extr.u  d3,d15,#0x0,#0x10           ; d3 = (d3 + 1) truncated to 16 bits, zero-extended
        jlt.u   d3,d4,LAB_AL1_LOOP          ; repeat while d3 < 0x13 (unsigned)
        nop
        ret                                 ; key is returned in d2

Ghidra was extremely helpful with reversing the whole firmware and I love it for that, but unfortunately I fear it does not really get the mathematics of this function. Here is its attempt at decompiling it:
 

/*
 * Ghidra's decompilation of the seed -> key routine, reproduced as posted.
 * Takes the seed and returns the key after 0x13 shift/xor rounds.
 * `uint`, `byte` and `ushort` are presumably Ghidra's own typedef names
 * (not standard C) -- confirm before compiling this anywhere.
 */
uint getKeyForSeed(int seed)
{
  /* bVar1 is declared but never used in this output. */
  byte bVar1;
  uint uVar2;
  ushort uVar3;
  
  /* Start value: seed * 0x41a3 plus a correction term derived from the seed's sign. */
  uVar2 = seed * 0x41a3 + ((seed * 0x77770123 >> 0x1f) - (seed >> 0x1f)) * -0x1ff43ea9;
  if ((int)uVar2 < 1) {
                    /* Zero or negative as a signed value: add 0x7fffffff to make it positive */
    uVar2 = uVar2 + 0x7fffffff;
  }
  uVar3 = 0;
  do {
    /* Branch-free select: exactly one of the two comparison results is 1, the other 0,
       so only one of the two xor-ed candidates survives the multiplication.
       NOTE(review): here 0xA3A31234 is paired with the non-negative case -- check this
       against the operand order of the `sel` instruction in the listing. */
    uVar2 = (uVar2 * 2 ^ 0xA3A31234) * (uint)(-1 < (int)uVar2) + (uVar2 * 2 ^ 0xB4B44321) * (uint)(-1 >= (int)uVar2);
    uVar3 = uVar3 + 1;
  } while (uVar3 < 0x13);
  return uVar2;
}

As you can see, Ghidra for example does not take into account the "sha d0,d1,#-0xd" access to d1. Now please see my attempt at the simplest possible translation:
 

// First attempt at an instruction-by-instruction C translation of the listing.
// Per the thread, this version does NOT reproduce the firmware's keys.
//   a4 : the 32-bit seed
//   d1 : treated here as a second input. NOTE(review): the TriCore manual quoted
//        further down in the thread defines E[n] as the register pair D[n+1]:D[n],
//        so `mul e0,...` writes d1 itself -- d1 is most likely not an argument.
// Returns the computed key (final value of d2).
uint32_t getKeyForSeed_Custom(int32_t a4, int32_t d1)
{
    int32_t d15 = a4 >> 0x1f;                         // sha d15,a4,#-0x1f
    int32_t d0 = 0x77770123;                            // movh d0,#0x7777 ... addi d0,d0,#0x123
    int64_t e0 = (int64_t)a4 * (int64_t)d0;           // mul e0,a4,d0     
    // The low word stored into d0 here is overwritten by the very next statement.
    d0 = e0 & 0xffffffff;
    // Uses the d1 parameter, not the upper 32 bits of the product in e0.
    d0 = d1 >> 0xd;                                     // sha d0,d1,#-0xd
    // e0 is never read again after this assignment.
    e0 = e0 & d0;
    d0 = d0 - d15;                                      // sub d0,d15
    d15 = 0x787c;                                       // mov d15,#0x787c
    a4 = a4 - (d0 * d15);                           // msub a4,a4,d0,d15
    d15 = 0x41a3;                                       // mov d15,#0x41a3
    a4 = a4 * d15;                                  // mul a4,d15
    d15 = 0xb15;                                        // mov d15,0xb15
    a4 = a4 - (d0 * d15);                           // msub a4,a4,d0,d15
    d15 = a4;                                         // mov d15, a4

    if (a4 > 0) {                                     // jgtz a4,AFTERADD
        goto AFTERADD;
    }

    d15 = a4 + 0x7fffffff;                            // making d15 a positive version of a4

    AFTERADD:
    int32_t d6 = 0xA3A31234;                            // movh d6,#0xa3a3 ... addi d6,d6,#0x1234
    int32_t d5 = 0xB4B44321;                            // movh d5,#0xb4b4 ... addi d5,d5,#0x4321
    int32_t d2 = d15;                                   // mov d2,d15
    int32_t d3 = 0x0;                                   // mov d3,0x0
    int32_t d4 = 0x13;                                  // mov a4,#0x13

    // One round: shift the key left by one and xor with d5 when the key was
    // non-negative, otherwise with d6. The body runs until d3 reaches d4 (0x13).
    LOOP:
        d15 = d2 << 0x1;                                // sha d15,d2,#0x1
        d0 = d2 >= 0x0;                                 // ge d0,d2,#0x0
        d1 = d15 ^ d6;                                  // xor d1,d15,d6
        d15 = d15 ^ d5;                                 // xor d15,d5
        d2 = d0 != 0 ? d15 : d1;                        // sel d2,d0,d15,d1
        d15 = d3 + 0x1;                                 // add d15,d3,#0x1
        d3 = (d15 >> 0x0) & ((1u << 0x10) - 1);         // extr.u d3,d15,#0x0,#0x10

    if (d3 < d4) {
        goto LOOP;
    }

    return d2;
}

Now the reason I'm posting this is obviously because my code gets different and therefore wrong results.
I am very grateful for any kind of hints towards what I am doing wrong.

Endianness of the MCU is little endian, just like my machine obviously.

Processor: Tricore TC1766 
Instruction-Set documented here: https://www.infineon.com/dgdl/tc_v131_instructionset_v138.pdf?fileId=db3a304412b407950112b409b6dd0352
Best regards

Edited by CuriousReverser
logical mistake
Posted

Having fun with ECUs, are we? :D

I've reversed code for bunch of different architectures but have not worked with the Tricore CPUs before. I imagine my process would be like this:

  • set up a debugging environment for TC1766. Either get some evaluation board, official IDE from Infineon, QEMU or whatever is available for the platform;
  • open 2 debuggers, one for TC1766, other with your implementation of the routine;
  • pass the same parameters to both routines;
  • take a single step at a time and see where the values/code flow diverge;

If debugging TC1766 code is not possible, next best thing would be to have a bunch of combinations "seed + d1 -> key" for testing. Then trying to fix your implementation until it gives the expected result.
And only if no other options were available, I'd spend my time blankly staring at disassembled code and trying to figure out statically. It's slow and painful. ;) 

CuriousReverser
Posted

Thank you for your input so far.
Interestingly, apparently there are no (longer?) any freely available evaluation boards that I can find. Probably because this MCU is so limited to the special use case of automotive products.
QEMU might do the trick once I did all the setup and provided the tricore emulation works as intended in this case.
I will probably have to go this way.
 

Actually I do have a nearly infinite amount of seed -> key combinations; "Then trying to fix your implementation until it gives the expected result." is the part I struggle with.

To be honest I was probably kind of hoping there would be one or a few obvious flaws in my implementation that someone more experienced would spot. 

I will try to setup the emulation and keep hoping for that one stupid mistake.

Posted

I'm not using Ghidra (IDA all the way for me) but I imagine there should be a way to tell it that function accepts 2 parameters. Perhaps then the decompiled code would make more sense.

Unfortunately I won't have any free time until middle of next week, because this sounds like a fun problem to look at. :)

 

BTW, Tricore reference manual says this:

Quote

E[n] Extended data register n containing a 64-bit value made from an even/odd pair of registers (D[n], D[n+1]). The format is little endian. E[n][63:32] = D[n+1][31:0]; E[n][31:0] = D[n][31:0]

So, D1 is most likely not passed as argument, but is part of a result of the "mul" operation. Ghidra sort-of recognized it, your code definitely doesn't.

  • Like 1
CuriousReverser
Posted

Wow, you are absolutely right about the d1 register! Ghidra was right about that, and my code was wrong about that.
I fixed this in my code and still don't get the correct result.
However, I went through the struggle of understanding and building a working qemu-tricore and gdb-tricore, and got single-step debugging of an emulated TriCore CPU working.
I finished this just now and am debugging the function this very minute. For all the curious, I will share the result in this thread.

Thank you for your input and idea about emulating + debugging using qemu.

CuriousReverser
Posted

I was sooo damn close 🫠

It was pretty much the "E[n][63:32] = D[n+1][31:0]; E[n][31:0] = D[n][31:0]" thing. 
For all the curious, here is the final result and correct translation:

// Seed -> key translation of the TriCore routine, written so that it compiles
// as standard C and only relies on well-defined arithmetic.
//
//   a4 : the 32-bit seed.
//   d1 : ignored. It is kept only so existing two-argument callers still build;
//        in the firmware d1 is not an input but the upper half of the 64-bit
//        product written by `mul e0,...`.
//
// Returns the 32-bit key.
//
// All 32-bit register arithmetic is done in uint32_t so that it wraps modulo
// 2^32 instead of overflowing a signed type.
uint32_t getKeyForSeed_Custom(int32_t a4, int32_t d1)
{
    (void)d1;

    // mul e0,<seed>,d0 : signed 64-bit product of the seed and 0x77770123.
    const int64_t product = (int64_t)a4 * (int64_t)0x77770123;

    // sha d15,<seed>,#-0x1f ; sha d0,d1,#-0xd ; sub d0,d15
    //   d0 = (high word of product >> 13) - (seed >> 31)
    //      = floor(product / 2^45) + (seed < 0 ? 1 : 0)
    // The multiplier is odd, so for a non-zero 32-bit seed the product is never
    // an exact multiple of 2^45; floor-plus-one for negative values therefore
    // equals C's truncating division, which avoids shifting negative numbers.
    const int32_t quotient = (int32_t)(product / ((int64_t)1 << 45));

    uint32_t value = (uint32_t)a4;
    value -= (uint32_t)quotient * 0x787cu;              // msub: value -= d0 * 0x787c
    value *= 0x41a3u;                                   // mul:  value *= 0x41a3
    value -= (uint32_t)quotient * 0xb15u;               // msub: value -= d0 * 0xb15

    // jgtz: keep the value when it is > 0 as a signed number; otherwise
    // addih #0x8000 followed by add #-1 adds 0x7fffffff.
    const int is_positive = (value != 0u) && ((value & 0x80000000u) == 0u);
    if (!is_positive) {
        value += 0x7fffffffu;
    }

    // 0x13 rounds: shift left by one, then xor with 0xB4B44321 when the key was
    // non-negative before the shift (sign bit clear), otherwise with 0xA3A31234.
    uint32_t key = value;
    for (unsigned round = 0; round < 0x13u; ++round) {
        const uint32_t mask = (key & 0x80000000u) ? 0xA3A31234u : 0xB4B44321u;
        key = (key << 1) ^ mask;
    }

    return key;
}

Thanks for the help

  • Thanks 3
  • 8 months later...
Posted
On 2023/9/1 at PM9点51分, CuriousReverser said:

哇,你对 d1 寄存器的看法绝对正确! Ghidra 对此是正确的,而我的代码对此是错误的。
我在代码中修复了这个问题,但仍然没有得到正确的结果。
然而,我经历了理解和构建工作 qemu-tricore、gdb-tricore 的斗争,并对模拟三核 cpu 进行了单步调试。
现在就完成了这个工作并在几分钟之内调试了该功能。对于所有好奇的人,我将在此线程中分享结果。

感谢您提供有关使用 qemu 进行模拟和调试的意见和想法。

Recently, I also ran into the need to debug TC1766 firmware. The tricore-qemu I compiled following the Simos18 reference cannot run the firmware in simulation. Could you tell me how you got the firmware to run in the emulator?

  • Like 1
  • 4 weeks later...

Create an account or sign in to comment

You need to be a member in order to leave a comment

Create an account

Sign up for a new account in our community. It's easy!

Register a new account

Sign in

Already have an account? Sign in here.

Sign In Now
×
×
  • Create New...