Here is the resulting code, which should work from Delphi 7 up to Delphi 2009 (I don't have the Delphi 2010 sources, so I don't know whether anything changed in the record RTTI in that version, but I guess not).

procedure _CopyRecord{ dest, source, typeInfo: Pointer };
// Copies one record from source to dest, driven by the record's RTTI.
// The bytes lying between the fields listed in the RTTI field table are
// copied verbatim; every listed field is handed to the matching helper
// (_LStrAsg, _WStrAsg, VarCopyProc, _CopyArray, _CopyRecord, _IntfCopy,
// _DynArrayAsg).  A field whose type kind has no handler ends in
// Error(reInvalidPtr); a Variant field while VarCopyProc is nil ends in
// Error(reVarInvalidOp).
//
// Register roles inside the field loop:
//   esi = source cursor        edi = dest cursor
//   ebx = current TFieldInfo   ebp = number of TFieldInfo entries left
//   eax = record offset already handled (reloaded with dest before a call)
//   [esp] = sizeof(record), pushed once before the loop
asm  // faster version by AB
        { ->    EAX pointer to dest             }
        {       EDX pointer to source           }
        {       ECX pointer to typeInfo         }
        push ebp
        push ebx
        push esi
        push edi
        movzx ebx,byte ptr [ecx].TTypeInfo.Name[0] // ebx = length byte of the type name
        mov esi,edx                     // esi = source
        mov edi,eax                     // edi = dest
        add ebx,ecx                     // ebx = TFieldTable
        xor eax,eax                     // eax = current offset
        mov ebp,[ebx].TFieldTable.Count // ebp = TFieldInfo count
        mov ecx,[ebx].TFieldTable.Size
        test ebp,ebp
        jz @fullcopy                    // no listed field: one plain move of the whole record
        push ecx                        // sizeof(record) on stack
        add ebx,offset TFieldTable.Fields[0]   // ebx = first TFieldInfo
@next:  mov ecx,[ebx].TFieldInfo.ValueOffset
        mov edx,[ebx].TFieldInfo.TypeInfo
        sub ecx,eax                     // ecx = plain bytes located before this field
        mov edx,[edx]                   // edx = field TTypeInfo (mov leaves the sub flags intact)
        jle @nomov
        lea esi,[esi+ecx]               // point both cursors at the field itself...
        lea edi,[edi+ecx]
        neg ecx                         // ...and walk the gap with a negative index up to 0
@mov1:  mov al,[esi+ecx] // fast copy not destructable data
        mov [edi+ecx],al
        inc ecx
        jnz @mov1
@nomov: mov eax,edi                     // eax = dest field, first argument of every helper
        movzx ecx,byte ptr [edx]    // data type
        cmp ecx,tkLString
        je @@LString
        jb @@err
        cmp ecx,tkDynArray
        je @@DynArray
        ja @@err
        jmp dword ptr [ecx*4+@@Tab-tkWString*4] // kinds between tkWString and tkDynArray-1
@@Tab:  dd @@WString,@@Variant,@@Array,@@Record,@@Interface,@@err
@@errv: mov al,reVarInvalidOp
        jmp @@err2
@@err:  mov al,reInvalidPtr
@@err2: pop ecx                         // drop sizeof(record): both error entries are only
                                        // reached from inside the loop, after the push above
        pop edi
        pop esi
        pop ebx
        pop ebp
        jmp Error                       // stack is now exactly as it was on entry
        nop // all functions below have esi=source edi=dest
@@Array:
        movzx ecx,byte ptr [edx].TTypeInfo.Name[0]
        push dword ptr [edx+ecx].TFieldTable.Size  // kept on the stack across the call
        push dword ptr [edx+ecx].TFieldTable.Count // stack argument of _CopyArray
        mov ecx,dword ptr [edx+ecx].TFieldTable.Fields[0]
        mov ecx,[ecx]                   // ecx = element TTypeInfo (one more indirection)
        mov edx,esi
        call _CopyArray
        pop eax // restore sizeof(Array)
        jmp @@finish
@@Record:
        movzx ecx,byte ptr [edx].TTypeInfo.Name[0]
        mov ecx,[edx+ecx].TFieldTable.Size
        push ecx                        // kept on the stack across the recursive call
        mov ecx,edx                     // ecx = TTypeInfo of the nested record
        mov edx,esi
        call _CopyRecord
        pop eax // restore sizeof(Record)
        jmp @@finish
@@Variant:
        mov ecx,[VarCopyProc]
        mov edx,esi
        test ecx,ecx
        jz @@errv                       // no variant copy handler installed
        call ecx
        mov eax,16                      // a Variant field advances the cursors by 16 bytes
        jmp @@finish
@@Interface:
        mov edx,[esi]
        call _IntfCopy
        jmp @@fin4
@@DynArray:
        mov ecx,edx // ecx=TypeInfo
        mov edx,[esi]
        call _DynArrayAsg
        jmp @@fin4
@@WString:                              // with LINUX defined this falls through to @@LString
{$ifndef LINUX}
        mov edx,[esi]
        call _WStrAsg
        jmp @@fin4
{$endif}
@@LString:
        mov edx,[esi]
        call _LStrAsg
@@fin4: mov eax,4                       // pointer-sized field
@@finish:                               // eax = size of the field just copied
        add esi,eax
        add edi,eax
        add eax,[ebx].TFieldInfo.ValueOffset // eax = offset just past this field
        dec ebp    // any other TFieldInfo?
        lea ebx,[ebx+8] // next TFieldInfo (lea keeps the flags set by dec)
        jnz @next
        pop ecx // ecx= sizeof(record)
@fullcopy:                              // ecx = sizeof(record), eax = offset already copied
        mov edx,edi
        sub ecx,eax                     // ecx = trailing plain bytes
        mov eax,esi
        jle @nomov2
        call move                       // Move(source, dest, count) in eax, edx, ecx
@nomov2:pop edi
        pop esi
        pop ebx
        pop ebp
end;

I've tested this source code with some unit tests, and IMHO it works fine. The speed increase is noticeable. At the very least, my code is much more readable than the original from Borland/Embarcadero, since I spelled out the field names (TFieldInfo/TFieldTable) and commented the source.

If you can tell whether my inlined byte-copy loop at @mov1 is faster than a call to Move, please let me know!

The code and test function can be downloaded from