Here is the resulting code, which should work from Delphi 7 up to 2009 (I don't have the 2010 sources, so I don't know if something was changed in the record RTTI with this version, but I guess not).

procedure _CopyRecord{ dest, source, typeInfo: Pointer };
asm  // faster version by AB
        // Copies one record onto another using the record's RTTI field table.
        // Bytes lying between the fields listed in the table are copied raw;
        // each listed field is assigned through the matching RTL helper
        // (_LStrAsg, _WStrAsg, VarCopyProc, _IntfCopy, _DynArrayAsg,
        // _CopyArray, or a nested _CopyRecord call).
        // Any other field kind ends in Error(reInvalidPtr); a nil VarCopyProc
        // ends in Error(reVarInvalidOp).
        { ->    EAX pointer to dest             }
        {       EDX pointer to source           }
        {       ECX pointer to typeInfo         }
        // Register roles once set up:
        //   esi / edi = source / dest cursor (both advance together)
        //   ebx       = current TFieldInfo
        //   ebp       = number of TFieldInfo entries still to process
        //   eax       = record offset copied so far (at @next and @fullcopy)
        //   [esp]     = sizeof(record), only while the field loop is running
        push ebp
        push ebx
        push esi
        push edi
        movzx ebx,byte ptr [ecx].TTypeInfo.Name[0]
        mov esi,edx                     // esi = source
        mov edi,eax                     // edi = dest
        add ebx,ecx                     // ebx = TFieldTable
        xor eax,eax                     // eax = current offset
        mov ebp,[ebx].TFieldTable.Count // ebp = TFieldInfo count
        mov ecx,[ebx].TFieldTable.Size
        test ebp,ebp
        jz @fullcopy                    // no listed field: one plain move
        push ecx                        // sizeof(record) on stack
        add ebx,offset TFieldTable.Fields[0]   // ebx = first TFieldInfo
@next:  mov ecx,[ebx].TFieldInfo.ValueOffset
        mov edx,[ebx].TFieldInfo.TypeInfo
        sub ecx,eax                     // ecx = raw bytes before this field
        mov edx,[edx]                   // edx = field type info (mov keeps the flags of sub)
        jle @nomov
        lea esi,[esi+ecx]               // move both cursors onto the field...
        lea edi,[edi+ecx]
        neg ecx                         // ...and count up from -len to 0
@mov1:  mov al,[esi+ecx] // fast copy not destructable data
        mov [edi+ecx],al
        inc ecx
        jnz @mov1
@nomov: mov eax,edi                     // eax = dest field, first argument of every helper
        movzx ecx,byte ptr [edx]    // data type
        cmp ecx,tkLString
        je @@LString
        jb @@err
        cmp ecx,tkDynArray
        je @@DynArray
        ja @@err
        // NOTE(review): every kind above tkDynArray is rejected here; if the
        // target compiler defines a managed kind beyond it (e.g. a Unicode
        // string kind), records holding such fields will raise reInvalidPtr
        // - confirm against the RTL of each supported Delphi version.
        jmp dword ptr [ecx*4+@@tab-tkWString*4]
@@Tab:  dd @@WString,@@Variant,@@Array,@@Record,@@Interface,@@err
@@errv: mov al,reVarInvalidOp
        jmp @@err2
@@err:  mov al,reInvalidPtr
@@err2: pop ecx                         // discard sizeof(record): every error exit is
                                        // taken from inside the field loop, so it is
                                        // still on the stack above the saved registers
        pop edi
        pop esi
        pop ebx
        pop ebp
        jmp Error
        // (the 1-byte pop above takes the place of the 1-byte nop that used
        //  to follow this jmp, so the offsets of the labels below are unchanged)
        // all functions below have esi=source edi=dest
@@Array:
        movzx ecx,byte ptr [edx].TTypeInfo.Name[0]
        push dword ptr [edx+ecx].TFieldTable.Size
        push dword ptr [edx+ecx].TFieldTable.Count
        mov ecx,dword ptr [edx+ecx].TFieldTable.Fields[0]
        mov ecx,[ecx]
        mov edx,esi
        call _CopyArray
        pop eax // restore sizeof(Array)
        jmp @@finish
@@Record:
        movzx ecx,byte ptr [edx].TTypeInfo.Name[0]
        mov ecx,[edx+ecx].TFieldTable.Size
        push ecx
        mov ecx,edx
        mov edx,esi
        call _CopyRecord
        pop eax // restore sizeof(Record)
        jmp @@finish
        nop;nop;nop
@@Variant:
        mov ecx,[VarCopyProc]
        mov edx,esi
        or ecx,ecx
        jz @@errv                       // no variant copy handler installed
        call ecx
        mov eax,16                      // bytes consumed by the variant field
        jmp @@finish
        nop;nop;nop
@@Interface:
        mov edx,[esi]
        call _IntfCopy
        jmp @@fin4
        nop
@@DynArray:
        mov ecx,edx // ecx=TypeInfo
        mov edx,[esi]
        call _DynArrayAsg
        jmp @@fin4
@@WString:
{$ifndef LINUX}
        mov edx,[esi]
        call _WStrAsg
        jmp @@fin4
        nop;nop
{$endif}
@@LString:
        mov edx,[esi]
        call _LStrAsg
@@fin4: mov eax,4                       // pointer-sized field
@@finish:
        add esi,eax                     // step both cursors over the field
        add edi,eax
        add eax,[ebx].TFieldInfo.ValueOffset   // eax = offset just past the field
        dec ebp    // any other TFieldInfo?
        lea ebx,[ebx+8] // next TFieldInfo (lea keeps the flags of dec)
        jnz @next
        pop ecx // ecx= sizeof(record)
@fullcopy:
        mov edx,edi                     // Move(source cursor, dest cursor, remaining bytes)
        sub ecx,eax
        mov eax,esi
        jle @nomov2
        call move
@nomov2:pop edi
        pop esi
        pop ebx
        pop ebp
end;

I've tested this source code with some unit tests, and IMHO it works fine. The speed increase is noticeable. At the very least, my code is much more readable than the original from Borland/Embarcadero, since I spelled out the field names (via the TFieldInfo/TFieldTable records) and commented the source.

If you can tell whether my inlined byte-copy loop at @mov1 is faster than a call to Move, please let me know!

The code and test function can be downloaded from http://synopse.info/files/CopyRecord.pas