可能是最快的算法alpha blend汇编源代码.docx
- 文档编号:7290379
- 上传时间:2023-01-22
- 格式:DOCX
- 页数:13
- 大小:18.77KB
可能是最快的算法alpha blend汇编源代码.docx
《可能是最快的算法alpha blend汇编源代码.docx》由会员分享,可在线阅读,更多相关《可能是最快的算法alpha blend汇编源代码.docx(13页珍藏版)》请在冰豆网上搜索。
可能是最快的算法alpha blend汇编源代码
可能是最快的算法alpha blend汇编源代码,Intel官方提供
Intel官方网站有一个ablend_565的快速汇编算法,理论上是把一块32bit RGBA渲染到16bit的buffer上。我的机器是PIII 800,函数在system memory中进行,640*480的256级alpha blending,达到100fps,我想可以满足绝大部分的要求了。在这里,我提供了这个算法的应用,希望可以对大家有所帮助。
ablend_565函数的源代码可以直接编译使用,无需其他库函数,感谢Intel提供这么好的东西。
首先,我提供一个本人编写的把32bit tga文件读入pRGBABuffer的函数,
文件尺寸保存在width、height中。
//-----------------------------------------------------------------------
// Name: LoadTgaFile(TCHAR* strPathname, DWORD** pRGBABuffer, long* width, long* height)
// Desc: 读取32bit tga文件到DWORD缓冲里,返回其尺寸
// Time: 2002.06.22 00:36
// Author: RealRender
// Para:
// Return:
// Note: 这段代码来自DirectX 7.0 sample中的d3dtextr.cpp,我把它提取了出来,
//       方便使用
//-----------------------------------------------------------------------
BOOLLoadTgaFile(TCHAR*strPathname,DWORD**pRGBABuffer,long*width,long*height)
{
FILE*file=fopen(strPathname,"rb");
if(NULL==file)
returnfalse;
structTargaHeader
{
BYTEIDLength;
BYTEColormapType;
BYTEImageType;
BYTEColormapSpecification[5];
WORDXOrigin;
WORDYOrigin;
WORDImageWidth;
WORDImageHeight;
BYTEPixelDepth;
BYTEImageDescriptor;
}tga;
fread(&tga,sizeof(TargaHeader),1,file);
//Onlytruecolor,non-mappedimagesaresupported
if((0!
=tga.ColormapType)||
(tga.ImageType!
=10&&tga.ImageType!
=2))
{
fclose(file);
returnfalse;
}
//SkiptheIDfield.Thefirstbyteoftheheaderisthelengthofthisfield
if(tga.IDLength)
fseek(file,tga.IDLength,SEEK_CUR);
DWORDm_dwWidth=tga.ImageWidth;
DWORDm_dwHeight=tga.ImageHeight;
DWORDm_dwBPP=tga.PixelDepth;
DWORD*m_pRGBAData=newDWORD[m_dwWidth*m_dwHeight];
if(m_pRGBAData==NULL)
{
fclose(file);
returnfalse;
}
for(DWORDy=0;y { DWORDdwOffset=y*m_dwWidth; if(0==(tga.ImageDescriptor&0x0010)) dwOffset=(m_dwHeight-y-1)*m_dwWidth; for(DWORDx=0;x { if(tga.ImageType==10) { BYTEPacketInfo=getc(file); WORDPacketType=0x80&PacketInfo; WORDPixelCount=(0x007f&PacketInfo)+1; if(PacketType) { DWORDb=getc(file); DWORDg=getc(file); DWORDr=getc(file); DWORDa=0xff; if(m_dwBPP==32) a=getc(file); while(PixelCount--) { m_pRGBAData[dwOffset+x]=(r<<24L)+(g<<16L)+(b<<8L)+(a); x++; } } else { while(PixelCount--) { BYTEb=getc(file); BYTEg=getc(file); BYTEr=getc(file); BYTEa=0xff; if(m_dwBPP==32) a=getc(file); m_pRGBAData[dwOffset+x]=(r<<24L)+(g<<16L)+(b<<8L)+(a); x++; } } } else { BYTEb=getc(file); BYTEg=getc(file); BYTEr=getc(file); BYTEa=0xff; if(m_dwBPP==32) a=getc(file); m_pRGBAData[dwOffset+x]=(r<<24L)+(g<<16L)+(b<<8L)+(a); x++; } } } fclose(file); //Checkforalphacontent for(DWORDi=0;i<(m_dwWidth*m_dwHeight);i++) { if(m_pRGBAData[i]&0x000000ff! =0xff) { //m_bHasAlpha=TRUE; break; } } *pRGBABuffer=m_pRGBAData; *width=m_dwWidth; *height=m_dwHeight; returntrue; } 把32bitbuffer分割为rgb和alpha的代码。 注意,分割后的pBitmap一定要是8字节对齐,这是优化的一个重要条件,所以,我的算法中: BYTE*p=newBYTE[lSize*2+8]; BYTE*pOrig=p; p+=(DWORD)p%8; WORD*color=(WORD*)p; 这是不规范的写法,把指针强行改变为8位对齐,实际使用的时候,要记住释放的原始指针不是p,而是pOrig,在这里,我没有释放分配的内存,请谅解。 //----------------------------------------------------------------------- //Name: SplitRGBA(DWORD*pRGBABuffer,LPBYTE*pAlpha,LPWORD*pBitmap,longlWidth,longlHeight) //Desc: //Time: 2002.06.2200: 36 //Author: RealRender //Para: //Return: //Note: 把从32bit的缓冲建立16bit的565缓冲和8bit的alpha通道 //----------------------------------------------------------------------- voidSplitRGBA(DWORD*pRGBABuffer,LPBYTE*pAlpha,LPWORD*pBitmap,longlWidth,longlHeight) { longlSize=lWidth*lHeight; BYTE*alpha=newBYTE[lSize]; BYTE*p=newBYTE[lSize*2+8]; //强行转换为8字节对齐 p+=(DWORD)p%8; WORD*color=(WORD*)p; DWORDdwPixel; DWORDr,g,b,a; for(inti=0;i { dwPixel=pRGBABuffer[i]; r=((dwPixel>>24)&0x000000ff); g=((dwPixel>>16)&0x000000ff); b=((dwPixel>>8)&0x000000ff); 
a=((dwPixel>>0)&0x000000ff); alpha[i]=a; //888i转化为565 color[i]=RGBTo16(r,g,b); } *pAlpha=alpha; *pBitmap=color; } // 这个视intel官方提供的函数,函数的描述,用我的话来说就是把一个带有256级alpha通道的565颜色数据绘制到16位目标页面。 函数说明: unsignedchar*lpAlpha,//256级alpha通道 unsignedintiAlpPitch,//alpha通道的pitch unsignedchar*lpSrc,//原色彩缓冲 unsignedintiSrcX,// unsignedintiSrcY,//原色彩位置 unsignedintiSrcPitch,//原色彩pitch unsignedchar*lpDst,//目标缓冲 unsignedintiDstX, unsignedintiDstY,//目标位置 unsignedintiDstW, unsignedintiDstH,//目标缓冲的尺寸 unsignedintiDstPitch//目标缓冲的pitch voidablend_565(unsignedchar*lpAlpha,unsignedintiAlpPitch, unsignedchar*lpSrc,unsignedintiSrcX,unsignedintiSrcY, unsignedintiSrcPitch,unsignedchar*lpDst, unsignedintiDstX,unsignedintiDstY, unsignedintiDstW,unsignedintiDstH, unsignedintiDstPitch) { //Maskforisolatingthered,green,andbluecomponents static__int64MASKB=0x001F001F001F001F; static__int64MASKG=0x07E007E007E007E0; static__int64MASKSHIFTG=0x03F003F003F003F0; static__int64MASKR=0xF800F800F800F800; //constantsusedbytheintegeralphablendingequation static__int64SIXTEEN=0x0010001000100010; static__int64FIVETWELVE=0x0200020002000200; static__int64SIXONES=0x003F003F003F003F; unsignedchar*lpLinearDstBp=(iDstX<<1)+(iDstY*iDstPitch)+lpDst;//basepointerforlineardestination unsignedchar*lpLinearSrcBp=(iSrcX<<1)+(iSrcY*iSrcPitch)+lpSrc;//basepointerforlinearsource unsignedchar*lpLinearAlpBp=iSrcX+(iSrcY*iAlpPitch)+lpAlpha;//basepointerforlinearalpha _asm{ movesi,lpLinearSrcBp;//src movedi,lpLinearDstBp;//dst moveax,lpLinearAlpBp;//alpha movecx,iDstH;//ecx=numberoflinestocopy movebx,iDstW;//ebx=spanwidthtocopy testesi,6;//checkifsourceaddressisqwordaligned //sinceaddrcominginisalwayswordaligned(16bit) jnzdone;//ifnotqwordalignedwedon'tdoanything primeloop: movdmm1,[eax];//mm1=00000000a3a2a1a0 pxormm2,mm2;//mm2=0; movqmm4,[esi];//g1: mm4=src3src2src1src0 punpcklbwmm1,mm2;//mm1=00a300a200a100a0 loopqword: movedx,[eax]; testebx,0xFFFFFFFC;//checkifonly3pixelsleft jzcheckback;//3orlesspixelsleft //earlyouttests 
cmpedx,0xffffffff;//testforalphavalueof1 jecopyback;//if1'scopythesourcepixelstothedestination testedx,0xffffffff;//testforalphavalueof0 jzleavefront;//ifsogotothenext4pixels //thealphablendstarts //green //i=a*sg+(63-a)*dg; //i=(i+32)+((i+32)>>6)>>6; //red //i=a*sr+(31-a)*dr; //i=(i+16)+((i+16)>>5)>>5; movqmm5,[edi];//g2: mm5=dst3dst2dst1dst0 psrlwmm1,2;//mm1=a? >>2nukeoutlower2bits movqmm7,MASKSHIFTG;//g3: mm7=1bitshiftedgreenmask psrlwmm4,1;//g3a: movesrcgreendownby1sothatwewon'toverflow movqmm0,mm1;//mm0=00a300a200a100a0 psrlwmm5,1;//g3b: movedstgreendownby1sothatwewon'toverflow psrlwmm1,1;//mm1=a? >>1nukeoutlower1bits pandmm4,mm7;//g5: mm4=sg3sg2sg1sg0 movqmm2,SIXONES;//g4: mm2=63 pandmm5,mm7;//g7: mm5=dg3dg2dg1dg0 movqmm3,[esi];//b1: mm3=src3src2src1src0 psubsbmm2,mm0;//g6: mm2=63-a363-a263-a163-a0 movqmm7,MASKB;//b2: mm7=BLUEMASK pmullwmm4,mm0;//g8: mm4=sg? *a? movqmm0,[edi];//b3: mm0=dst3dst2dst1dst0 pmullwmm5,mm2;//g9: mm5=dg? *(1-a? ) movqmm2,mm7;//b4: mm2=fiveones pandmm3,mm7;//b4: mm3=sb3sb2sb1sb0 pmullwmm3,mm1;//b6: mm3=sb? *a? pandmm0,mm7;//b5: mm0=db3db2db1db0 movqmm7,[esi];//r1: mm7=src3src2src1src0 paddwmm4,mm5;//g10: mm4=sg? *a? +dg? *(1-a? ) pandmm7,MASKR;//r2: mm7=sr3sr2sr1sr0 psubsbmm2,mm1;//b5a: mm2=31-a331-a231-a131-a0 paddwmm4,FIVETWELVE;//g11: mm4=(mm4+512)green pmullwmm0,mm2;//b7: mm0=db? *(1-a? ) movqmm5,mm4;//g12: mm5=mm4green psrlwmm7,11;//r4: shiftsrcreddowntoposition0 psrlwmm4,6;//g13: mm4=mm4>>6 paddwmm4,mm5;//g14: mm4=mm4+mm5green paddwmm0,mm3;//b8: mm0=sb? *a? +db? *(1-a? ) movqmm5,[edi];//r3: mm5=dst3dst2dst1dst0 paddwmm0,SIXTEEN;//b9: mm0=(mm0+16)blue pandmm5,MASKR;//r5: mm5=dr3dr2dr1dr0 psrlwmm4,5;//g15: mm4=0? g00? g00? g00? g0green movqmm3,mm0;//b10: mm3=mm0blue psrlwmm0,5;//b11: mm0=mm0>>5blue psrlwmm5,11;//r6: shiftdstreddowntoposition0 paddwmm0,mm3;//b12: mm0=mm3+mm0blue psrlwmm0,5;//b13: mm0=000b000b000b000bblue pmullwmm7,mm1;//mm7=sr? *a? pandmm4,MASKG;//g16: mm4=00g000g000g000g0green pmullwmm5,mm2;//r7: mm5=dr? *(31-a? 
) pormm0,mm4;//mm0=00gb00gb00gb00gb addeax,4;//movetonext4alphas addesi,8;//movetonext4pixelsinsrc addedi,8;//movetonext4pixelsindst movdmm1,[eax];//mm1=00000000a3a2a1a0 paddwmm5,mm7;//r8: mm5=sr? *a? +dr? *(31-a? ) paddwmm5,SIXTEEN;//r9: mm5=(mm5+16)red pxormm2,mm2;//mm2=0; movqmm7,mm5;//r10: mm7=mm5red psrlwmm5,5;//r11: mm5=mm5>>5red movqmm4,[esi];//g1: mm4=src3src2src1src0 paddwmm5,mm7;//r12: mm5=mm7+mm5red punpcklbwmm1,mm2;//mm1=00a300a200a100a0 psrlwmm5,5;//r13: mm5=mm5>>5red psllwmm5,11;//r14: mm5=mm5<<10red pormm0,mm5;//mm0=0rgb0rgb0rgb0rgb subebx,4;//polishedoff4pixels movq[edi-8],mm0;//dst=0rgb0rgb0rgb0rgb jmploopqword;//gobacktostart copyback: movq[edi],mm4;//copysourcetodestination leavefront: addedi,8;//advancedestinationby4pixels addeax,4;//advancealphaby4 addesi,8;//advancesourceby4pixels subebx,4;//decreasepixelcountby4 jmpprimeloop; checkback: testebx,0xFF;//checkif0pixelsleft jznextline;//donewiththisspan //backalign: //workoutbackendpixels movqmm5,[edi];//g2: mm5=dst3dst2dst1dst0 psrlwmm1,2;//mm1=a? >>2nukeout
- 配套讲稿:
如PPT文件的首页显示word图标,表示该PPT已包含配套word讲稿。双击word图标可打开word文档。
- 特殊限制:
部分文档作品中含有的国旗、国徽等图片,仅作为作品整体效果示例展示,禁止商用。设计者仅对作品中独创性部分享有著作权。
- 关 键 词:
- 可能是最快的算法alpha blend汇编源代码 可能 最快 算法 alpha blend 汇编 源代码