当前位置：首页 > 32位浮点数乘法汇编算法

32位浮点数乘法汇编算法

2024-10-25 17:45:13

32位浮点数乘法汇编算法

现在写汇编的人基本已经绝迹了，只有老一辈的工程师还有一些在用。

#include <stdio.h>
#include <sys/time.h>

/*
 * Minimal wall-clock timing helpers built on gettimeofday().
 *
 * Usage inside one function scope:
 *     INIT_TIMER_VALIABLE          -- declares tpstart, tpend and timeuse
 *     START_TIMER                  -- records the start time in tpstart
 *     ... code under test ...
 *     END_PRINTF_TIMER("label")    -- records the end time and prints the
 *                                     elapsed microseconds
 *
 * Each macro already ends with ';', so call sites use them without a
 * trailing semicolon.
 */
#define INIT_TIMER_VALIABLE \
    struct timeval tpstart, tpend; \
    float timeuse;

#define START_TIMER gettimeofday(&tpstart, NULL);

/*
 * The seconds are subtracted BEFORE scaling to microseconds.  Scaling the
 * absolute tv_sec value first (tv_sec * 1000 * 1000) overflows when tv_sec
 * is a 32-bit integer type, and the huge intermediate values would also
 * exceed what a float can represent exactly.
 */
#define END_PRINTF_TIMER(name) \
    gettimeofday(&tpend, NULL); \
    timeuse = (float)(tpend.tv_sec - tpstart.tv_sec) * 1000000.0f \
            + (float)(tpend.tv_usec - tpstart.tv_usec); \
    printf("func :%s:time use(us) %f\n", name, timeuse);

float vfp_operate(float f1, float f2)

{

float sum=0;

__asm__ __volatile__(

"vmov s1, %1\"

"vmov s2, %2\"

"fmuls s0, s1, s2\"

"vmov %0, s0\"

:"=r"(sum)

:"r"(f1),"r"(f2)

);

return sum;

}

int main()

{

float f1,f2;

float result;

INIT_TIMER_VALIABLE

printf("input float data1:");

scanf("%f",&f1);

printf("input float data2:");

scanf("%f",&f2);

START_TIMER

result=f1*f2;

END_PRINTF_TIMER("use system function")

printf("result is %f\",result);

START_TIMER

result=vfp_operate(f1,f2);

END_PRINTF_TIMER("use vfp_operate")

printf("result is %f\",result);

}

测试结果：

./vfp_helloworld

input float data1:0.125

input float data2:1.684

func :use system function:time use(us) 12.000000

result is 0.210500

func :use vfp_operate:time use(us) 5.000000

result is 0.210500

测试使用编译选项：

源代码：

#include <stdio.h>

/*
 * Smallest possible float-multiply program: reads two floats and prints
 * their product.  It is the input used to compare the assembly the compiler
 * emits with and without -mfloat-abi=softfp, so the statements are kept
 * exactly as in the original.
 */
int main()
{
    float f1,f2;

    printf("input float data1:");
    scanf("%f",&f1);

    printf("input float data2:");
    scanf("%f",&f2);

    /* The float arguments are promoted to double for the variadic call. */
    printf("float %f x %f =%f\n", f1,f2,f1*f2);
}

编译选项：（只是多增加了 -mfloat-abi=softfp；测试发现 -mfloat-abi=softfp 和 -mfloat-abi=hard 编译出的汇编一样）

@ ======================================================================
@ Listing A: built WITH -mfloat-abi=softfp (hardware VFP instructions)
@
@   arm-none-linux-gnueabi-gcc vfp_helloworld.c -S -mfloat-abi=softfp -o vfp_helloworld.asm
@
@ Locals: f1 at [fp, #-8], f2 at [fp, #-12].
@ ======================================================================
	.cpu arm10tdmi
	.eabi_attribute 27, 3
	.fpu vfp
	.eabi_attribute 20, 1
	.eabi_attribute 21, 1
	.eabi_attribute 23, 3
	.eabi_attribute 24, 1
	.eabi_attribute 25, 1
	.eabi_attribute 26, 2
	.eabi_attribute 30, 6
	.eabi_attribute 18, 4
	.file	"vfp_helloworld.c"
	.section	.rodata
	.align	2
.LC0:
	.ascii	"input float data1:\000"
	.align	2
.LC1:
	.ascii	"%f\000"
	.align	2
.LC2:
	.ascii	"input float data2:\000"
	.align	2
.LC3:
	.ascii	"float %f x %f =%f\012\000"
	.text
	.align	2
	.global	main
	.type	main, %function
main:
	.fnstart
.LFB2:
	@ args = 0, pretend = 0, frame = 16
	@ frame_needed = 1, uses_anonymous_args = 0
	stmfd	sp!, {fp, lr}
	.save {fp, lr}
.LCFI0:
	.setfp fp, sp, #4
	add	fp, sp, #4
.LCFI1:
	.pad #32
	sub	sp, sp, #32
.LCFI2:
	ldr	r0, .L3
	bl	printf			@ prompt for data1
	sub	r3, fp, #8		@ r3 = &f1
	ldr	r0, .L3+4
	mov	r1, r3
	bl	scanf
	ldr	r0, .L3+8
	bl	printf			@ prompt for data2
	sub	r3, fp, #12		@ r3 = &f2
	ldr	r0, .L3+4
	mov	r1, r3
	bl	scanf
	flds	s15, [fp, #-8]
	fcvtds	d5, s15			@ d5 = (double)f1
	flds	s15, [fp, #-12]
	fcvtds	d6, s15			@ d6 = (double)f2
	flds	s14, [fp, #-8]
	flds	s15, [fp, #-12]
	fmuls	s15, s14, s15		@ <<< multiply done directly by a hardware VFP instruction
	fcvtds	d7, s15			@ d7 = (double)(f1 * f2)
	fstd	d6, [sp, #0]		@ stack argument: f2
	fstd	d7, [sp, #8]		@ stack argument: product
	ldr	r0, .L3+12		@ r0 = format string .LC3
	fmrrd	r2, r3, d5		@ r2:r3 = f1 as double
	bl	printf
	sub	sp, fp, #4
	ldmfd	sp!, {fp, pc}
.L4:
	.align	2
.L3:
	.word	.LC0
	.word	.LC1
	.word	.LC2
	.word	.LC3
.LFE2:
	.fnend
	.size	main, .-main
	.ident	"GCC: (Sourcery G++ Lite 2009q1-203) 4.3.3"
	.section	.note.GNU-stack,"",%progbits

@ ======================================================================
@ Listing B: built WITHOUT -mfloat-abi (software floating point)
@
@   arm-none-linux-gnueabi-gcc vfp_helloworld.c -S -o vfp_helloworld.asm
@
@ Locals: f1 at [fp, #-32], f2 at [fp, #-36].
@ ======================================================================
	.cpu arm10tdmi
	.fpu softvfp
	.eabi_attribute 20, 1
	.eabi_attribute 21, 1
	.eabi_attribute 23, 3
	.eabi_attribute 24, 1
	.eabi_attribute 25, 1
	.eabi_attribute 26, 2
	.eabi_attribute 30, 6
	.eabi_attribute 18, 4
	.file	"vfp_helloworld.c"
	.section	.rodata
	.align	2
.LC0:
	.ascii	"input float data1:\000"
	.align	2
.LC1:
	.ascii	"%f\000"
	.align	2
.LC2:
	.ascii	"input float data2:\000"
	.global	__aeabi_f2d
	.global	__aeabi_fmul		@ <<< this reference shows software floating point is in use
	.align	2
.LC3:
	.ascii	"float %f x %f =%f\012\000"
	.text
	.align	2
	.global	main
	.type	main, %function
main:
	.fnstart
.LFB2:
	@ args = 0, pretend = 0, frame = 16
	@ frame_needed = 1, uses_anonymous_args = 0
	stmfd	sp!, {r4, r5, r6, r7, r8, fp, lr}
	.save {r4, r5, r6, r7, r8, fp, lr}
.LCFI0:
	.setfp fp, sp, #24
	add	fp, sp, #24
.LCFI1:
	.pad #36
	sub	sp, sp, #36
.LCFI2:
	ldr	r0, .L3
	bl	printf			@ prompt for data1
	sub	r3, fp, #32		@ r3 = &f1
	ldr	r0, .L3+4
	mov	r1, r3
	bl	scanf
	ldr	r0, .L3+8
	bl	printf			@ prompt for data2
	sub	r3, fp, #36		@ r3 = &f2
	ldr	r0, .L3+4
	mov	r1, r3
	bl	scanf
	ldr	r3, [fp, #-32]	@ float
	mov	r0, r3
	bl	__aeabi_f2d		@ r0:r1 = (double)f1
	mov	r5, r0
	mov	r6, r1			@ kept in r5:r6
	ldr	r3, [fp, #-36]	@ float
	mov	r0, r3
	bl	__aeabi_f2d		@ r0:r1 = (double)f2
	mov	r7, r0
	mov	r8, r1			@ kept in r7:r8
	ldr	r3, [fp, #-32]	@ float
	ldr	r2, [fp, #-36]	@ float
	mov	r0, r3
	mov	r1, r2
	bl	__aeabi_fmul		@ <<< multiply done by a call into the software float library
	mov	r3, r0
	mov	r0, r3
	bl	__aeabi_f2d		@ r0:r1 = (double)(f1 * f2)
	mov	r3, r0
	mov	r4, r1
	stmia	sp, {r7-r8}		@ stack argument: f2
	str	r3, [sp, #8]		@ stack argument: product (low word)
	str	r4, [sp, #12]		@ stack argument: product (high word)
	ldr	r0, .L3+12		@ r0 = format string .LC3
	mov	r2, r5
	mov	r3, r6			@ r2:r3 = f1 as double
	bl	printf
	sub	sp, fp, #24
	ldmfd	sp!, {r4, r5, r6, r7, r8, fp, pc}
.L4:
	.align	2
.L3:
	.word	.LC0
	.word	.LC1
	.word	.LC2
	.word	.LC3
.LFE2:
	.fnend
	.size	main, .-main
	.ident	"GCC: (Sourcery G++ Lite 2009q1-203) 4.3.3"
	.section	.note.GNU-stack,"",%progbits

摘自：ARM 的 PDF 文档的说明

浮点运算的支持

ARM 处理器内核不包含浮点硬件。必须使用以下两种方法之一，另行提供对浮点算法的支持：

在软件中，使用浮点库 fplib。此库提供了执行浮点运算可以调用的函数，无需额外的硬件。请参阅《库指南》中第 4-2 页的软件浮点库 fplib。

在硬件中，使用含 VFP 硬件协处理器的 ARM 处理器内核来进行所需的浮点运算。 VFP 是执行 IEEE 浮点的协处理器体系结构，支持单精度和双精度，但不支持扩展精度。

Note

在实际编程中，VFP 中的浮点运算实际是组合使用硬件（执行常见的情况）和软件（处理不常见的情况和导致异常的情况）执行的。请参阅VFP 支持。

Example 5.2 是一个用 C 执行浮点算法的函数，用以说明浮点算法的软件和硬件支持的不同。

Example 5.2. 浮点运算

/*
 * Example 5.2: returns (num2 * num2) - (num1 + num2), using one float
 * addition, one multiplication and one subtraction.
 */
float foo(float num1, float num2)
{
    float sum = num1 + num2;        /* addition */
    float square = num2 * num2;     /* multiplication */

    return square - sum;            /* subtraction */
}

如果使用命令行选项 --cpu 5TE --fpu softvfp 编译Example 5.2 的 C 代码，则编译器生成的机器代码的反汇编如Example 5.3 所示。在本示例中，在软件中通过调用库例程（如 __aeabi_fmul）来执行浮点算法。

Example 5.3. 软件中对浮点运算的支持

; Example 5.3: foo() compiled for software floating point.
; Every float operation is a call into the floating-point library.
; In:  r0 = num1, r1 = num2 (float bit patterns in core registers)
; Out: r0 = (num2 * num2) - (num1 + num2), produced by __aeabi_fsub
||foo|| PROC
        PUSH     {r4-r6, lr}
        MOV      r4, r1             ; keep num2 across the call below
        BL       __aeabi_fadd       ; <<< first the addition: r0 = num1 + num2
        MOV      r5, r0             ; r5 = temp (the sum)
        MOV      r1, r4
        MOV      r0, r4             ; both operands = num2
        BL       __aeabi_fmul       ; <<< then the multiplication: r0 = num2 * num2
        MOV      r1, r5             ; r1 = temp, r0 = temp2
        POP      {r4-r6, lr}        ; lr restored so the branch below is a tail call
        B        __aeabi_fsub       ; result temp2 - temp is returned to foo's caller
        ENDP

如果使用命令行选项 --fpu vfp 编译Example 5.2 的 C 代码，则编译器生成的机器代码的反汇编如Example 5.4 所示。在本示例中，在硬件中通过浮点算法指令（如 VMUL.F32）来执行浮点算法。

Example 5.4. 硬件中对浮点运算的支持

; Example 5.4: foo() compiled for hardware VFP; each float operation is a
; single VFP instruction.  s0 and s1 are read without being loaded first,
; so they carry the arguments (s1 = num2, as it is the squared operand;
; s0 presumably num1).  The result is left in s0.
||foo|| PROC

VADD.F32 s2, s0, s1 ; s2 = s0 + s1 (temp = num1 + num2)

VMUL.F32 s0, s1, s1 ; s0 = s1 * s1 (temp2 = num2 * num2)

VSUB.F32 s0, s0, s2 ; s0 = temp2 - temp

BX lr ; return to caller

ENDP

在实际编程中，使用硬件支持浮点算法的代码更为紧凑，并提供比在软件中执行浮点算法的代码更佳的性能。但是，浮点算法的硬件支持需要 VFP 协处理器。

缺省情况下，如果有 VFP 协处理器，则会生成 VFP 指令。如果没有 VFP 协处理器，则编译器会生成调用软件浮点库 fplib 的代码，用于执行浮点运算。fplib 是 RealView Development Suite 标准分发的 C 库的组成部分。

32位浮点数乘法汇编算法

相关文章

热门推荐