One of the main methods of the unit is the following:

 /// Allocate aSize bytes for the current thread
 // - returns nil when aSize is 0
 // - the request is routed to a block list chosen by size class (32-byte,
 // 256-byte and, if USEMEDIUM is defined, 2048-byte granularity)
 // - requests larger than every size class are passed to GetOldMem()
 // - for pooled requests, the returned pointer is advanced by
 // SizeOf(TMemHeader) from the item supplied by the block list
 function TThreadMemManager.GetMem(aSize: NativeUInt): Pointer;
 var
   list: PMemBlockList;
 begin
   // nothing to allocate: aSize is unsigned, so 0 is the only "empty" request
   if aSize = 0 then
   begin
     Result := nil;
     Exit;
   end;
   // select the block list matching the requested size
   if aSize <= (length(FMiniMemoryBlocks)*32) then
     // blocks of 32: 32, 64, 96, 128, 160, 192, 224
     list := @FMiniMemoryBlocks[(aSize-1) shr 5]
   else if aSize <= (length(FSmallMemoryBlocks)*256) then
     // blocks of 256: 256,512,768,1024,1280,1536,1792 bytes
     list := @FSmallMemoryBlocks[(aSize-1) shr 8]
 {$ifdef USEMEDIUM}
   else if aSize <= (length(FMediumMemoryBlocks)*2048) then
     // blocks of 2048: 2048, 4096... bytes
     list := @FMediumMemoryBlocks[(aSize-1) shr 11]
 {$endif}
   else
   begin
     // larger blocks are allocated via the old Memory Manager
     Result := GetOldMem(aSize);
     Exit;
   end;
   // handle any pending memory freed by other threads before allocating
   if FOtherThreadFreedMemory <> nil then
     ProcessFreedMemFromOtherThreads;
   if list^.FFirstFreedMemBlock = nil then
     // no freed block available: take the item from a new block
     Result := list^.GetMemFromNewBlock
   else
     // reuse an item from the first freed block
     Result := list^.FFirstFreedMemBlock.GetUsedMemoryItem;
   // step over the TMemHeader-sized prefix of the item
   Result := Pointer(NativeUInt(Result) + SizeOf(TMemHeader));
 end;

This method is usually called from the Delphi GetMem() function. It is declared inline, which means the compiler expands its body directly at each call site, as if it had been typed there, whereas a non-inlined method is invoked through a regular call.

In some cases, the asm code produced by the Delphi compiler is really well optimized. For instance, when pm := Owner.GetMem(SizeOf(pm^)); is compiled, it generates these asm instructions:

SynScaleMM.pas.752: pm := Owner.GetMem(SizeOf(pm^));
0040B7FA 8B4308           mov eax,[ebx+$08]
0040B7FD BA02000000       mov edx,$00000002
0040B802 8D74D074         lea esi,[eax+edx*8+$74]
0040B806 833800           cmp dword ptr [eax],$00
0040B809 7405             jz $0040b810
0040B80B E810FEFFFF       call TThreadMemManager.ProcessFreedMemFromOtherThreads
0040B810 8BC6             mov eax,esi
0040B812 8B10             mov edx,[eax]
0040B814 85D2             test edx,edx
0040B816 7409             jz $0040b821
0040B818 8BC2             mov eax,edx
0040B81A E889FFFFFF       call TMemBlock.GetUsedMemoryItem
0040B81F EB05             jmp $0040b826
0040B821 E8EA000000       call TMemBlockList.GetMemFromNewBlock
0040B826 83C008           add eax,$08
0040B829 8BF0             mov esi,eax

As you can see, all the if aSize <= (length(FMiniMemoryBlocks)*32) then tests have been resolved at compile time into a single fixed path, because here the aSize parameter is a constant (SizeOf(pm^)): the address of the matching block list is computed directly by the lea instruction.

Modern compilers rock!
Inlining is one of the few features I miss in Delphi 7. But the same code compiled with Delphi 7 (without inlining, since I wanted our fork to compile from Delphi 6 up to XE) performs very well: the generated code is a little less aggressively optimized, but it still scales much better than FastMM4, in all cases!

Feedback is welcome on our forum!