re PR libfortran/78379 (Processor-specific versions for matmul)
2017-05-25 Thomas Koenig <tkoenig@gcc.gnu.org>
PR libfortran/78379
* Makefile.am: Add generated/matmulavx128_*.c files.
Handle them for compiling and setting the right flags.
* acinclude.m4: Add tests for FMA3, FMA4 and AVX128.
* configure.ac: Call them.
* Makefile.in: Regenerated.
* config.h.in: Regenerated.
* configure: Regenerated.
* m4/matmul.m4: Handle AMD chips by calling 128-bit AVX
versions which use FMA3 or FMA4.
* m4/matmulavx128.m4: New file.
* generated/matmul_c10.c: Regenerated.
* generated/matmul_c16.c: Regenerated.
* generated/matmul_c4.c: Regenerated.
* generated/matmul_c8.c: Regenerated.
* generated/matmul_i1.c: Regenerated.
* generated/matmul_i16.c: Regenerated.
* generated/matmul_i2.c: Regenerated.
* generated/matmul_i4.c: Regenerated.
* generated/matmul_i8.c: Regenerated.
* generated/matmul_r10.c: Regenerated.
* generated/matmul_r16.c: Regenerated.
* generated/matmul_r4.c: Regenerated.
* generated/matmul_r8.c: Regenerated.
* generated/matmulavx128_c10.c: New file.
* generated/matmulavx128_c16.c: New file.
* generated/matmulavx128_c4.c: New file.
* generated/matmulavx128_c8.c: New file.
* generated/matmulavx128_i1.c: New file.
* generated/matmulavx128_i16.c: New file.
* generated/matmulavx128_i2.c: New file.
* generated/matmulavx128_i4.c: New file.
* generated/matmulavx128_i8.c: New file.
* generated/matmulavx128_r10.c: New file.
* generated/matmulavx128_r16.c: New file.
* generated/matmulavx128_r4.c: New file.
* generated/matmulavx128_r8.c: New file.
From-SVN: r248472
This commit is contained in:
parent
87e1e6036e
commit
1d5cf7fcf2
@ -1,3 +1,43 @@
|
||||
2017-05-25 Thomas Koenig <tkoenig@gcc.gnu.org>
|
||||
|
||||
PR libfortran/78379
|
||||
* Makefile.am: Add generated/matmulavx128_*.c files.
|
||||
Handle them for compiling and setting the right flags.
|
||||
* acinclude.m4: Add tests for FMA3, FMA4 and AVX128.
|
||||
* configure.ac: Call them.
|
||||
* Makefile.in: Regenerated.
|
||||
* config.h.in: Regenerated.
|
||||
* configure: Regenerated.
|
||||
* m4/matmul.m4: Handle AMD chips by calling 128-bit AVX
|
||||
versions which use FMA3 or FMA4.
|
||||
* m4/matmulavx128.m4: New file.
|
||||
* generated/matmul_c10.c: Regenerated.
|
||||
* generated/matmul_c16.c: Regenerated.
|
||||
* generated/matmul_c4.c: Regenerated.
|
||||
* generated/matmul_c8.c: Regenerated.
|
||||
* generated/matmul_i1.c: Regenerated.
|
||||
* generated/matmul_i16.c: Regenerated.
|
||||
* generated/matmul_i2.c: Regenerated.
|
||||
* generated/matmul_i4.c: Regenerated.
|
||||
* generated/matmul_i8.c: Regenerated.
|
||||
* generated/matmul_r10.c: Regenerated.
|
||||
* generated/matmul_r16.c: Regenerated.
|
||||
* generated/matmul_r4.c: Regenerated.
|
||||
* generated/matmul_r8.c: Regenerated.
|
||||
* generated/matmulavx128_c10.c: New file.
|
||||
* generated/matmulavx128_c16.c: New file.
|
||||
* generated/matmulavx128_c4.c: New file.
|
||||
* generated/matmulavx128_c8.c: New file.
|
||||
* generated/matmulavx128_i1.c: New file.
|
||||
* generated/matmulavx128_i16.c: New file.
|
||||
* generated/matmulavx128_i2.c: New file.
|
||||
* generated/matmulavx128_i4.c: New file.
|
||||
* generated/matmulavx128_i8.c: New file.
|
||||
* generated/matmulavx128_r10.c: New file.
|
||||
* generated/matmulavx128_r16.c: New file.
|
||||
* generated/matmulavx128_r4.c: New file.
|
||||
* generated/matmulavx128_r8.c: New file.
|
||||
|
||||
2017-05-19 Paul Thomas <pault@gcc.gnu.org>
|
||||
Jerry DeLisle <jvdelisle@gcc.gnu.org>
|
||||
|
||||
@ -14,7 +54,7 @@
|
||||
(st_endfile): Likewise.
|
||||
(st_rewind): Likewise.
|
||||
(st_flush): Likewise.
|
||||
|
||||
|
||||
2017-05-15 Jerry DeLisle <jvdelisle@gcc.gnu.org>
|
||||
|
||||
PR libgfortran/80727
|
||||
|
||||
@ -460,6 +460,21 @@ $(srcdir)/generated/matmul_c8.c \
|
||||
$(srcdir)/generated/matmul_c10.c \
|
||||
$(srcdir)/generated/matmul_c16.c
|
||||
|
||||
i_matmulavx128_c= \
|
||||
$(srcdir)/generated/matmulavx128_i1.c \
|
||||
$(srcdir)/generated/matmulavx128_i2.c \
|
||||
$(srcdir)/generated/matmulavx128_i4.c \
|
||||
$(srcdir)/generated/matmulavx128_i8.c \
|
||||
$(srcdir)/generated/matmulavx128_i16.c \
|
||||
$(srcdir)/generated/matmulavx128_r4.c \
|
||||
$(srcdir)/generated/matmulavx128_r8.c \
|
||||
$(srcdir)/generated/matmulavx128_r10.c \
|
||||
$(srcdir)/generated/matmulavx128_r16.c \
|
||||
$(srcdir)/generated/matmulavx128_c4.c \
|
||||
$(srcdir)/generated/matmulavx128_c8.c \
|
||||
$(srcdir)/generated/matmulavx128_c10.c \
|
||||
$(srcdir)/generated/matmulavx128_c16.c
|
||||
|
||||
i_matmull_c= \
|
||||
$(srcdir)/generated/matmul_l4.c \
|
||||
$(srcdir)/generated/matmul_l8.c \
|
||||
@ -641,7 +656,7 @@ gfor_built_src= $(i_all_c) $(i_any_c) $(i_count_c) $(i_maxloc0_c) \
|
||||
$(i_iparity_c) $(i_norm2_c) $(i_parity_c) \
|
||||
$(i_matmul_c) $(i_matmull_c) $(i_shape_c) $(i_eoshift1_c) \
|
||||
$(i_eoshift3_c) $(i_cshift1_c) $(i_reshape_c) $(in_pack_c) $(in_unpack_c) \
|
||||
$(i_pow_c) $(i_pack_c) $(i_unpack_c) \
|
||||
$(i_pow_c) $(i_pack_c) $(i_unpack_c) $(i_matmulavx128_c) \
|
||||
$(i_spread_c) selected_int_kind.inc selected_real_kind.inc kinds.h \
|
||||
$(i_cshift0_c) kinds.inc c99_protos.inc fpu-target.h fpu-target.inc
|
||||
|
||||
@ -796,7 +811,12 @@ intrinsics/dprod_r8.f90 \
|
||||
intrinsics/f2c_specifics.F90
|
||||
|
||||
# Turn on vectorization and loop unrolling for matmul.
|
||||
$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
|
||||
$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
|
||||
|
||||
if HAVE_AVX128
|
||||
# Turn on AVX128 for AMD-specific matmul, but only if the compiler understands -mprefer-avx128
|
||||
$(patsubst %.c,%.lo,$(notdir $(i_matmulavx128_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4 -mprefer-avx128
|
||||
endif
|
||||
# Logical matmul doesn't vectorize.
|
||||
$(patsubst %.c,%.lo,$(notdir $(i_matmull_c))): AM_CFLAGS += -funroll-loops
|
||||
|
||||
@ -936,6 +956,9 @@ $(i_sum_c): m4/sum.m4 $(I_M4_DEPS1)
|
||||
$(i_matmul_c): m4/matmul.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
|
||||
$(M4) -Dfile=$@ -I$(srcdir)/m4 matmul.m4 > $@
|
||||
|
||||
$(i_matmulavx128_c): m4/matmulavx128.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
|
||||
$(M4) -Dfile=$@ -I$(srcdir)/m4 matmulavx128.m4 > $@
|
||||
|
||||
$(i_matmull_c): m4/matmull.m4 $(I_M4_DEPS)
|
||||
$(M4) -Dfile=$@ -I$(srcdir)/m4 matmull.m4 > $@
|
||||
|
||||
|
||||
@ -289,15 +289,20 @@ am__objects_32 = unpack_i1.lo unpack_i2.lo unpack_i4.lo unpack_i8.lo \
|
||||
unpack_i16.lo unpack_r4.lo unpack_r8.lo unpack_r10.lo \
|
||||
unpack_r16.lo unpack_c4.lo unpack_c8.lo unpack_c10.lo \
|
||||
unpack_c16.lo
|
||||
am__objects_33 = spread_i1.lo spread_i2.lo spread_i4.lo spread_i8.lo \
|
||||
am__objects_33 = matmulavx128_i1.lo matmulavx128_i2.lo \
|
||||
matmulavx128_i4.lo matmulavx128_i8.lo matmulavx128_i16.lo \
|
||||
matmulavx128_r4.lo matmulavx128_r8.lo matmulavx128_r10.lo \
|
||||
matmulavx128_r16.lo matmulavx128_c4.lo matmulavx128_c8.lo \
|
||||
matmulavx128_c10.lo matmulavx128_c16.lo
|
||||
am__objects_34 = spread_i1.lo spread_i2.lo spread_i4.lo spread_i8.lo \
|
||||
spread_i16.lo spread_r4.lo spread_r8.lo spread_r10.lo \
|
||||
spread_r16.lo spread_c4.lo spread_c8.lo spread_c10.lo \
|
||||
spread_c16.lo
|
||||
am__objects_34 = cshift0_i1.lo cshift0_i2.lo cshift0_i4.lo \
|
||||
am__objects_35 = cshift0_i1.lo cshift0_i2.lo cshift0_i4.lo \
|
||||
cshift0_i8.lo cshift0_i16.lo cshift0_r4.lo cshift0_r8.lo \
|
||||
cshift0_r10.lo cshift0_r16.lo cshift0_c4.lo cshift0_c8.lo \
|
||||
cshift0_c10.lo cshift0_c16.lo
|
||||
am__objects_35 = $(am__objects_4) $(am__objects_5) $(am__objects_6) \
|
||||
am__objects_36 = $(am__objects_4) $(am__objects_5) $(am__objects_6) \
|
||||
$(am__objects_7) $(am__objects_8) $(am__objects_9) \
|
||||
$(am__objects_10) $(am__objects_11) $(am__objects_12) \
|
||||
$(am__objects_13) $(am__objects_14) $(am__objects_15) \
|
||||
@ -307,14 +312,14 @@ am__objects_35 = $(am__objects_4) $(am__objects_5) $(am__objects_6) \
|
||||
$(am__objects_25) $(am__objects_26) $(am__objects_27) \
|
||||
$(am__objects_28) $(am__objects_29) $(am__objects_30) \
|
||||
$(am__objects_31) $(am__objects_32) $(am__objects_33) \
|
||||
$(am__objects_34)
|
||||
@LIBGFOR_MINIMAL_FALSE@am__objects_36 = close.lo file_pos.lo format.lo \
|
||||
$(am__objects_34) $(am__objects_35)
|
||||
@LIBGFOR_MINIMAL_FALSE@am__objects_37 = close.lo file_pos.lo format.lo \
|
||||
@LIBGFOR_MINIMAL_FALSE@ inquire.lo intrinsics.lo list_read.lo \
|
||||
@LIBGFOR_MINIMAL_FALSE@ lock.lo open.lo read.lo transfer.lo \
|
||||
@LIBGFOR_MINIMAL_FALSE@ transfer128.lo unit.lo unix.lo write.lo \
|
||||
@LIBGFOR_MINIMAL_FALSE@ fbuf.lo
|
||||
am__objects_37 = size_from_kind.lo $(am__objects_36)
|
||||
@LIBGFOR_MINIMAL_FALSE@am__objects_38 = access.lo c99_functions.lo \
|
||||
am__objects_38 = size_from_kind.lo $(am__objects_37)
|
||||
@LIBGFOR_MINIMAL_FALSE@am__objects_39 = access.lo c99_functions.lo \
|
||||
@LIBGFOR_MINIMAL_FALSE@ chdir.lo chmod.lo clock.lo cpu_time.lo \
|
||||
@LIBGFOR_MINIMAL_FALSE@ ctime.lo date_and_time.lo dtime.lo \
|
||||
@LIBGFOR_MINIMAL_FALSE@ env.lo etime.lo execute_command_line.lo \
|
||||
@ -324,19 +329,19 @@ am__objects_37 = size_from_kind.lo $(am__objects_36)
|
||||
@LIBGFOR_MINIMAL_FALSE@ rename.lo stat.lo symlnk.lo \
|
||||
@LIBGFOR_MINIMAL_FALSE@ system_clock.lo time.lo umask.lo \
|
||||
@LIBGFOR_MINIMAL_FALSE@ unlink.lo
|
||||
@IEEE_SUPPORT_TRUE@am__objects_39 = ieee_helper.lo
|
||||
am__objects_40 = associated.lo abort.lo args.lo cshift0.lo eoshift0.lo \
|
||||
@IEEE_SUPPORT_TRUE@am__objects_40 = ieee_helper.lo
|
||||
am__objects_41 = associated.lo abort.lo args.lo cshift0.lo eoshift0.lo \
|
||||
eoshift2.lo erfc_scaled.lo extends_type_of.lo fnum.lo \
|
||||
ierrno.lo ishftc.lo mvbits.lo move_alloc.lo pack_generic.lo \
|
||||
selected_char_kind.lo size.lo spread_generic.lo \
|
||||
string_intrinsics.lo rand.lo random.lo reshape_generic.lo \
|
||||
reshape_packed.lo selected_int_kind.lo selected_real_kind.lo \
|
||||
unpack_generic.lo in_pack_generic.lo in_unpack_generic.lo \
|
||||
$(am__objects_38) $(am__objects_39)
|
||||
@IEEE_SUPPORT_TRUE@am__objects_41 = ieee_arithmetic.lo \
|
||||
$(am__objects_39) $(am__objects_40)
|
||||
@IEEE_SUPPORT_TRUE@am__objects_42 = ieee_arithmetic.lo \
|
||||
@IEEE_SUPPORT_TRUE@ ieee_exceptions.lo ieee_features.lo
|
||||
am__objects_42 =
|
||||
am__objects_43 = _abs_c4.lo _abs_c8.lo _abs_c10.lo _abs_c16.lo \
|
||||
am__objects_43 =
|
||||
am__objects_44 = _abs_c4.lo _abs_c8.lo _abs_c10.lo _abs_c16.lo \
|
||||
_abs_i4.lo _abs_i8.lo _abs_i16.lo _abs_r4.lo _abs_r8.lo \
|
||||
_abs_r10.lo _abs_r16.lo _aimag_c4.lo _aimag_c8.lo \
|
||||
_aimag_c10.lo _aimag_c16.lo _exp_r4.lo _exp_r8.lo _exp_r10.lo \
|
||||
@ -360,19 +365,19 @@ am__objects_43 = _abs_c4.lo _abs_c8.lo _abs_c10.lo _abs_c16.lo \
|
||||
_conjg_c4.lo _conjg_c8.lo _conjg_c10.lo _conjg_c16.lo \
|
||||
_aint_r4.lo _aint_r8.lo _aint_r10.lo _aint_r16.lo _anint_r4.lo \
|
||||
_anint_r8.lo _anint_r10.lo _anint_r16.lo
|
||||
am__objects_44 = _sign_i4.lo _sign_i8.lo _sign_i16.lo _sign_r4.lo \
|
||||
am__objects_45 = _sign_i4.lo _sign_i8.lo _sign_i16.lo _sign_r4.lo \
|
||||
_sign_r8.lo _sign_r10.lo _sign_r16.lo _dim_i4.lo _dim_i8.lo \
|
||||
_dim_i16.lo _dim_r4.lo _dim_r8.lo _dim_r10.lo _dim_r16.lo \
|
||||
_atan2_r4.lo _atan2_r8.lo _atan2_r10.lo _atan2_r16.lo \
|
||||
_mod_i4.lo _mod_i8.lo _mod_i16.lo _mod_r4.lo _mod_r8.lo \
|
||||
_mod_r10.lo _mod_r16.lo
|
||||
am__objects_45 = misc_specifics.lo
|
||||
am__objects_46 = $(am__objects_43) $(am__objects_44) $(am__objects_45) \
|
||||
am__objects_46 = misc_specifics.lo
|
||||
am__objects_47 = $(am__objects_44) $(am__objects_45) $(am__objects_46) \
|
||||
dprod_r8.lo f2c_specifics.lo
|
||||
am__objects_47 = $(am__objects_3) $(am__objects_35) $(am__objects_37) \
|
||||
$(am__objects_40) $(am__objects_41) $(am__objects_42) \
|
||||
$(am__objects_46)
|
||||
@onestep_FALSE@am_libgfortran_la_OBJECTS = $(am__objects_47)
|
||||
am__objects_48 = $(am__objects_3) $(am__objects_36) $(am__objects_38) \
|
||||
$(am__objects_41) $(am__objects_42) $(am__objects_43) \
|
||||
$(am__objects_47)
|
||||
@onestep_FALSE@am_libgfortran_la_OBJECTS = $(am__objects_48)
|
||||
@onestep_TRUE@am_libgfortran_la_OBJECTS = libgfortran_c.lo
|
||||
libgfortran_la_OBJECTS = $(am_libgfortran_la_OBJECTS)
|
||||
DEFAULT_INCLUDES = -I.@am__isrc@
|
||||
@ -879,6 +884,21 @@ $(srcdir)/generated/matmul_c8.c \
|
||||
$(srcdir)/generated/matmul_c10.c \
|
||||
$(srcdir)/generated/matmul_c16.c
|
||||
|
||||
i_matmulavx128_c = \
|
||||
$(srcdir)/generated/matmulavx128_i1.c \
|
||||
$(srcdir)/generated/matmulavx128_i2.c \
|
||||
$(srcdir)/generated/matmulavx128_i4.c \
|
||||
$(srcdir)/generated/matmulavx128_i8.c \
|
||||
$(srcdir)/generated/matmulavx128_i16.c \
|
||||
$(srcdir)/generated/matmulavx128_r4.c \
|
||||
$(srcdir)/generated/matmulavx128_r8.c \
|
||||
$(srcdir)/generated/matmulavx128_r10.c \
|
||||
$(srcdir)/generated/matmulavx128_r16.c \
|
||||
$(srcdir)/generated/matmulavx128_c4.c \
|
||||
$(srcdir)/generated/matmulavx128_c8.c \
|
||||
$(srcdir)/generated/matmulavx128_c10.c \
|
||||
$(srcdir)/generated/matmulavx128_c16.c
|
||||
|
||||
i_matmull_c = \
|
||||
$(srcdir)/generated/matmul_l4.c \
|
||||
$(srcdir)/generated/matmul_l8.c \
|
||||
@ -1059,7 +1079,7 @@ gfor_built_src = $(i_all_c) $(i_any_c) $(i_count_c) $(i_maxloc0_c) \
|
||||
$(i_iparity_c) $(i_norm2_c) $(i_parity_c) \
|
||||
$(i_matmul_c) $(i_matmull_c) $(i_shape_c) $(i_eoshift1_c) \
|
||||
$(i_eoshift3_c) $(i_cshift1_c) $(i_reshape_c) $(in_pack_c) $(in_unpack_c) \
|
||||
$(i_pow_c) $(i_pack_c) $(i_unpack_c) \
|
||||
$(i_pow_c) $(i_pack_c) $(i_unpack_c) $(i_matmulavx128_c) \
|
||||
$(i_spread_c) selected_int_kind.inc selected_real_kind.inc kinds.h \
|
||||
$(i_cshift0_c) kinds.inc c99_protos.inc fpu-target.h fpu-target.inc
|
||||
|
||||
@ -1518,6 +1538,19 @@ distclean-compile:
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmul_r16.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmul_r4.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmul_r8.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_c10.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_c16.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_c4.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_c8.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i1.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i16.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i2.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i4.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i8.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_r10.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_r16.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_r4.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_r8.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/maxloc0_16_i1.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/maxloc0_16_i16.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/maxloc0_16_i2.Plo@am__quote@
|
||||
@ -4584,6 +4617,97 @@ unpack_c16.lo: $(srcdir)/generated/unpack_c16.c
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o unpack_c16.lo `test -f '$(srcdir)/generated/unpack_c16.c' || echo '$(srcdir)/'`$(srcdir)/generated/unpack_c16.c
|
||||
|
||||
matmulavx128_i1.lo: $(srcdir)/generated/matmulavx128_i1.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i1.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i1.Tpo -c -o matmulavx128_i1.lo `test -f '$(srcdir)/generated/matmulavx128_i1.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i1.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i1.Tpo $(DEPDIR)/matmulavx128_i1.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i1.c' object='matmulavx128_i1.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i1.lo `test -f '$(srcdir)/generated/matmulavx128_i1.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i1.c
|
||||
|
||||
matmulavx128_i2.lo: $(srcdir)/generated/matmulavx128_i2.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i2.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i2.Tpo -c -o matmulavx128_i2.lo `test -f '$(srcdir)/generated/matmulavx128_i2.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i2.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i2.Tpo $(DEPDIR)/matmulavx128_i2.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i2.c' object='matmulavx128_i2.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i2.lo `test -f '$(srcdir)/generated/matmulavx128_i2.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i2.c
|
||||
|
||||
matmulavx128_i4.lo: $(srcdir)/generated/matmulavx128_i4.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i4.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i4.Tpo -c -o matmulavx128_i4.lo `test -f '$(srcdir)/generated/matmulavx128_i4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i4.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i4.Tpo $(DEPDIR)/matmulavx128_i4.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i4.c' object='matmulavx128_i4.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i4.lo `test -f '$(srcdir)/generated/matmulavx128_i4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i4.c
|
||||
|
||||
matmulavx128_i8.lo: $(srcdir)/generated/matmulavx128_i8.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i8.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i8.Tpo -c -o matmulavx128_i8.lo `test -f '$(srcdir)/generated/matmulavx128_i8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i8.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i8.Tpo $(DEPDIR)/matmulavx128_i8.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i8.c' object='matmulavx128_i8.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i8.lo `test -f '$(srcdir)/generated/matmulavx128_i8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i8.c
|
||||
|
||||
matmulavx128_i16.lo: $(srcdir)/generated/matmulavx128_i16.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i16.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i16.Tpo -c -o matmulavx128_i16.lo `test -f '$(srcdir)/generated/matmulavx128_i16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i16.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i16.Tpo $(DEPDIR)/matmulavx128_i16.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i16.c' object='matmulavx128_i16.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i16.lo `test -f '$(srcdir)/generated/matmulavx128_i16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i16.c
|
||||
|
||||
matmulavx128_r4.lo: $(srcdir)/generated/matmulavx128_r4.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_r4.lo -MD -MP -MF $(DEPDIR)/matmulavx128_r4.Tpo -c -o matmulavx128_r4.lo `test -f '$(srcdir)/generated/matmulavx128_r4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r4.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_r4.Tpo $(DEPDIR)/matmulavx128_r4.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_r4.c' object='matmulavx128_r4.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_r4.lo `test -f '$(srcdir)/generated/matmulavx128_r4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r4.c
|
||||
|
||||
matmulavx128_r8.lo: $(srcdir)/generated/matmulavx128_r8.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_r8.lo -MD -MP -MF $(DEPDIR)/matmulavx128_r8.Tpo -c -o matmulavx128_r8.lo `test -f '$(srcdir)/generated/matmulavx128_r8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r8.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_r8.Tpo $(DEPDIR)/matmulavx128_r8.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_r8.c' object='matmulavx128_r8.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_r8.lo `test -f '$(srcdir)/generated/matmulavx128_r8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r8.c
|
||||
|
||||
matmulavx128_r10.lo: $(srcdir)/generated/matmulavx128_r10.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_r10.lo -MD -MP -MF $(DEPDIR)/matmulavx128_r10.Tpo -c -o matmulavx128_r10.lo `test -f '$(srcdir)/generated/matmulavx128_r10.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r10.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_r10.Tpo $(DEPDIR)/matmulavx128_r10.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_r10.c' object='matmulavx128_r10.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_r10.lo `test -f '$(srcdir)/generated/matmulavx128_r10.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r10.c
|
||||
|
||||
matmulavx128_r16.lo: $(srcdir)/generated/matmulavx128_r16.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_r16.lo -MD -MP -MF $(DEPDIR)/matmulavx128_r16.Tpo -c -o matmulavx128_r16.lo `test -f '$(srcdir)/generated/matmulavx128_r16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r16.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_r16.Tpo $(DEPDIR)/matmulavx128_r16.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_r16.c' object='matmulavx128_r16.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_r16.lo `test -f '$(srcdir)/generated/matmulavx128_r16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r16.c
|
||||
|
||||
matmulavx128_c4.lo: $(srcdir)/generated/matmulavx128_c4.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_c4.lo -MD -MP -MF $(DEPDIR)/matmulavx128_c4.Tpo -c -o matmulavx128_c4.lo `test -f '$(srcdir)/generated/matmulavx128_c4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c4.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_c4.Tpo $(DEPDIR)/matmulavx128_c4.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_c4.c' object='matmulavx128_c4.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_c4.lo `test -f '$(srcdir)/generated/matmulavx128_c4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c4.c
|
||||
|
||||
matmulavx128_c8.lo: $(srcdir)/generated/matmulavx128_c8.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_c8.lo -MD -MP -MF $(DEPDIR)/matmulavx128_c8.Tpo -c -o matmulavx128_c8.lo `test -f '$(srcdir)/generated/matmulavx128_c8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c8.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_c8.Tpo $(DEPDIR)/matmulavx128_c8.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_c8.c' object='matmulavx128_c8.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_c8.lo `test -f '$(srcdir)/generated/matmulavx128_c8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c8.c
|
||||
|
||||
matmulavx128_c10.lo: $(srcdir)/generated/matmulavx128_c10.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_c10.lo -MD -MP -MF $(DEPDIR)/matmulavx128_c10.Tpo -c -o matmulavx128_c10.lo `test -f '$(srcdir)/generated/matmulavx128_c10.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c10.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_c10.Tpo $(DEPDIR)/matmulavx128_c10.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_c10.c' object='matmulavx128_c10.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_c10.lo `test -f '$(srcdir)/generated/matmulavx128_c10.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c10.c
|
||||
|
||||
matmulavx128_c16.lo: $(srcdir)/generated/matmulavx128_c16.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_c16.lo -MD -MP -MF $(DEPDIR)/matmulavx128_c16.Tpo -c -o matmulavx128_c16.lo `test -f '$(srcdir)/generated/matmulavx128_c16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c16.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_c16.Tpo $(DEPDIR)/matmulavx128_c16.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_c16.c' object='matmulavx128_c16.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_c16.lo `test -f '$(srcdir)/generated/matmulavx128_c16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c16.c
|
||||
|
||||
spread_i1.lo: $(srcdir)/generated/spread_i1.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT spread_i1.lo -MD -MP -MF $(DEPDIR)/spread_i1.Tpo -c -o spread_i1.lo `test -f '$(srcdir)/generated/spread_i1.c' || echo '$(srcdir)/'`$(srcdir)/generated/spread_i1.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/spread_i1.Tpo $(DEPDIR)/spread_i1.Plo
|
||||
@ -5567,7 +5691,10 @@ uninstall-am: uninstall-cafexeclibLTLIBRARIES \
|
||||
@LIBGFOR_USE_SYMVER_SUN_TRUE@@LIBGFOR_USE_SYMVER_TRUE@ > $@ || (rm -f $@ ; exit 1)
|
||||
|
||||
# Turn on vectorization and loop unrolling for matmul.
|
||||
$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
|
||||
$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
|
||||
|
||||
# Turn on AVX128 for AMD-specific matmul, but only if the compiler understands -mprefer-avx128
|
||||
@HAVE_AVX128_TRUE@$(patsubst %.c,%.lo,$(notdir $(i_matmulavx128_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4 -mprefer-avx128
|
||||
# Logical matmul doesn't vectorize.
|
||||
$(patsubst %.c,%.lo,$(notdir $(i_matmull_c))): AM_CFLAGS += -funroll-loops
|
||||
|
||||
@ -5667,6 +5794,9 @@ fpu-target.inc: fpu-target.h $(srcdir)/libgfortran.h
|
||||
@MAINTAINER_MODE_TRUE@$(i_matmul_c): m4/matmul.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
|
||||
@MAINTAINER_MODE_TRUE@ $(M4) -Dfile=$@ -I$(srcdir)/m4 matmul.m4 > $@
|
||||
|
||||
@MAINTAINER_MODE_TRUE@$(i_matmulavx128_c): m4/matmulavx128.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
|
||||
@MAINTAINER_MODE_TRUE@ $(M4) -Dfile=$@ -I$(srcdir)/m4 matmulavx128.m4 > $@
|
||||
|
||||
@MAINTAINER_MODE_TRUE@$(i_matmull_c): m4/matmull.m4 $(I_M4_DEPS)
|
||||
@MAINTAINER_MODE_TRUE@ $(M4) -Dfile=$@ -I$(srcdir)/m4 matmull.m4 > $@
|
||||
|
||||
|
||||
@ -452,3 +452,53 @@ AC_DEFUN([LIBGFOR_CHECK_AVX512F], [
|
||||
[])
|
||||
CFLAGS="$ac_save_CFLAGS"
|
||||
])
|
||||
|
||||
dnl Check for FMA3: define HAVE_FMA3 when the C compiler can compile a
|
||||
dnl call to __builtin_fmaf with CFLAGS set to "-O2 -mfma -mno-fma4".
dnl This is a compile-only probe; CFLAGS is saved first and restored after.
|
||||
AC_DEFUN([LIBGFOR_CHECK_FMA3], [
|
||||
ac_save_CFLAGS="$CFLAGS"
|
||||
CFLAGS="-O2 -mfma -mno-fma4"
|
||||
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
|
||||
float
|
||||
flt_mul_add (float a, float b, float c)
|
||||
{
|
||||
return __builtin_fmaf (a, b, c);
|
||||
}]], [[]])],
|
||||
AC_DEFINE(HAVE_FMA3, 1,
|
||||
[Define if FMA3 instructions can be compiled.]),
|
||||
[])
|
||||
CFLAGS="$ac_save_CFLAGS"
|
||||
])
|
||||
|
||||
dnl Check for FMA4: define HAVE_FMA4 when the C compiler can compile a
|
||||
dnl call to __builtin_fmaf with CFLAGS set to "-O2 -mfma4 -mno-fma".
dnl This is a compile-only probe; CFLAGS is saved first and restored after.
|
||||
AC_DEFUN([LIBGFOR_CHECK_FMA4], [
|
||||
ac_save_CFLAGS="$CFLAGS"
|
||||
CFLAGS="-O2 -mfma4 -mno-fma"
|
||||
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
|
||||
float
|
||||
flt_mul_add (float a, float b, float c)
|
||||
{
|
||||
return __builtin_fmaf (a, b, c);
|
||||
}]], [[]])],
|
||||
AC_DEFINE(HAVE_FMA4, 1,
|
||||
[Define if FMA4 instructions can be compiled.]),
|
||||
[])
|
||||
CFLAGS="$ac_save_CFLAGS"
|
||||
])
|
||||
|
||||
dnl Check for -mprefer-avx128
dnl
dnl Compile-only probe using CFLAGS "-O2 -mavx -mprefer-avx128".
dnl On success, HAVE_AVX128 is defined in config.h and the automake
dnl conditional HAVE_AVX128 is set to true; on failure the conditional
dnl is set to false.  The conditional has to be defined on both paths:
dnl if it is only set in the success branch, configure stops with
dnl 'conditional "HAVE_AVX128" was never defined' whenever the compiler
dnl rejects the option.  CFLAGS is saved first and restored afterwards.
AC_DEFUN([LIBGFOR_CHECK_AVX128], [
  ac_save_CFLAGS="$CFLAGS"
  CFLAGS="-O2 -mavx -mprefer-avx128"
  AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
  void foo()
  {
  }]], [[]])],
	AC_DEFINE(HAVE_AVX128, 1,
	[Define if -mprefer-avx128 is supported.])
	AM_CONDITIONAL([HAVE_AVX128],true),
	[AM_CONDITIONAL([HAVE_AVX128],false)])
  CFLAGS="$ac_save_CFLAGS"
])
|
||||
|
||||
@ -81,6 +81,9 @@
|
||||
/* Define if AVX instructions can be compiled. */
|
||||
#undef HAVE_AVX
|
||||
|
||||
/* Define if -mprefer-avx128 is supported. */
|
||||
#undef HAVE_AVX128
|
||||
|
||||
/* Define if AVX2 instructions can be compiled. */
|
||||
#undef HAVE_AVX2
|
||||
|
||||
@ -375,6 +378,12 @@
|
||||
/* Define to 1 if you have the `floorl' function. */
|
||||
#undef HAVE_FLOORL
|
||||
|
||||
/* Define if FMA3 instructions can be compiled. */
|
||||
#undef HAVE_FMA3
|
||||
|
||||
/* Define if FMA4 instructions can be compiled. */
|
||||
#undef HAVE_FMA4
|
||||
|
||||
/* Define to 1 if you have the `fmod' function. */
|
||||
#undef HAVE_FMOD
|
||||
|
||||
|
||||
103
libgfortran/configure
vendored
103
libgfortran/configure
vendored
@ -606,6 +606,8 @@ am__EXEEXT_TRUE
|
||||
LTLIBOBJS
|
||||
LIBOBJS
|
||||
get_gcc_base_ver
|
||||
HAVE_AVX128_FALSE
|
||||
HAVE_AVX128_TRUE
|
||||
IEEE_FLAGS
|
||||
IEEE_SUPPORT
|
||||
IEEE_SUPPORT_FALSE
|
||||
@ -12421,7 +12423,7 @@ else
|
||||
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
|
||||
lt_status=$lt_dlunknown
|
||||
cat > conftest.$ac_ext <<_LT_EOF
|
||||
#line 12424 "configure"
|
||||
#line 12426 "configure"
|
||||
#include "confdefs.h"
|
||||
|
||||
#if HAVE_DLFCN_H
|
||||
@ -12527,7 +12529,7 @@ else
|
||||
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
|
||||
lt_status=$lt_dlunknown
|
||||
cat > conftest.$ac_ext <<_LT_EOF
|
||||
#line 12530 "configure"
|
||||
#line 12532 "configure"
|
||||
#include "confdefs.h"
|
||||
|
||||
#if HAVE_DLFCN_H
|
||||
@ -26363,6 +26365,99 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
|
||||
CFLAGS="$ac_save_CFLAGS"
|
||||
|
||||
|
||||
# Check for FMA3 extensions
|
||||
|
||||
ac_save_CFLAGS="$CFLAGS"
|
||||
CFLAGS="-O2 -mfma -mno-fma4"
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
|
||||
float
|
||||
flt_mul_add (float a, float b, float c)
|
||||
{
|
||||
return __builtin_fmaf (a, b, c);
|
||||
}
|
||||
int
|
||||
main ()
|
||||
{
|
||||
|
||||
;
|
||||
return 0;
|
||||
}
|
||||
_ACEOF
|
||||
if ac_fn_c_try_compile "$LINENO"; then :
|
||||
|
||||
$as_echo "#define HAVE_FMA3 1" >>confdefs.h
|
||||
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
|
||||
CFLAGS="$ac_save_CFLAGS"
|
||||
|
||||
|
||||
# Check for FMA4 extensions
|
||||
|
||||
ac_save_CFLAGS="$CFLAGS"
|
||||
CFLAGS="-O2 -mfma4 -mno-fma"
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
|
||||
float
|
||||
flt_mul_add (float a, float b, float c)
|
||||
{
|
||||
return __builtin_fmaf (a, b, c);
|
||||
}
|
||||
int
|
||||
main ()
|
||||
{
|
||||
|
||||
;
|
||||
return 0;
|
||||
}
|
||||
_ACEOF
|
||||
if ac_fn_c_try_compile "$LINENO"; then :
|
||||
|
||||
$as_echo "#define HAVE_FMA4 1" >>confdefs.h
|
||||
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
|
||||
CFLAGS="$ac_save_CFLAGS"
|
||||
|
||||
|
||||
# Check if AVX128 works
#
# Compile-only probe with CFLAGS "-O2 -mavx -mprefer-avx128".  On success
# HAVE_AVX128 is appended to confdefs.h and the HAVE_AVX128 automake
# conditional is enabled; on failure the conditional is disabled.  Both
# branches assign HAVE_AVX128_TRUE/HAVE_AVX128_FALSE, so the later
# "conditional was never defined" sanity check cannot fire.

  ac_save_CFLAGS="$CFLAGS"
  CFLAGS="-O2 -mavx -mprefer-avx128"
  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */

void foo()
{
}
int
main ()
{

;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :

$as_echo "#define HAVE_AVX128 1" >>confdefs.h

	 if true; then
  HAVE_AVX128_TRUE=
  HAVE_AVX128_FALSE='#'
else
  HAVE_AVX128_TRUE='#'
  HAVE_AVX128_FALSE=
fi

else
   # The compiler rejected the flags: define the conditional as false.
   if false; then
  HAVE_AVX128_TRUE=
  HAVE_AVX128_FALSE='#'
else
  HAVE_AVX128_TRUE='#'
  HAVE_AVX128_FALSE=
fi

fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
  CFLAGS="$ac_save_CFLAGS"
|
||||
|
||||
|
||||
# Determine what GCC version number to use in filesystem paths.
|
||||
|
||||
get_gcc_base_ver="cat"
|
||||
@ -26615,6 +26710,10 @@ if test -z "${IEEE_SUPPORT_TRUE}" && test -z "${IEEE_SUPPORT_FALSE}"; then
|
||||
as_fn_error "conditional \"IEEE_SUPPORT\" was never defined.
|
||||
Usually this means the macro was only invoked conditionally." "$LINENO" 5
|
||||
fi
|
||||
if test -z "${HAVE_AVX128_TRUE}" && test -z "${HAVE_AVX128_FALSE}"; then
|
||||
as_fn_error "conditional \"HAVE_AVX128\" was never defined.
|
||||
Usually this means the macro was only invoked conditionally." "$LINENO" 5
|
||||
fi
|
||||
|
||||
: ${CONFIG_STATUS=./config.status}
|
||||
ac_write_fail=0
|
||||
|
||||
@ -624,6 +624,15 @@ LIBGFOR_CHECK_AVX2
|
||||
# Check whether we support AVX512f extensions
|
||||
LIBGFOR_CHECK_AVX512F
|
||||
|
||||
# Check for FMA3 extensions
|
||||
LIBGFOR_CHECK_FMA3
|
||||
|
||||
# Check for FMA4 extensions
|
||||
LIBGFOR_CHECK_FMA4
|
||||
|
||||
# Check if AVX128 works
|
||||
LIBGFOR_CHECK_AVX128
|
||||
|
||||
# Determine what GCC version number to use in filesystem paths.
|
||||
GCC_BASE_VER
|
||||
|
||||
|
||||
@ -1734,6 +1734,24 @@ matmul_c10_avx512f (gfc_array_c10 * const restrict retarray,
|
||||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_c10_avx128_fma3 (gfc_array_c10 * const restrict retarray,
|
||||
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_c10_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_c10_avx128_fma4 (gfc_array_c10 * const restrict retarray,
|
||||
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_c10_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_c10_vanilla (gfc_array_c10 * const restrict retarray,
|
||||
@ -2332,6 +2350,26 @@ void matmul_c10 (gfc_array_c10 * const restrict retarray,
|
||||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_c10_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_c10_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
@ -1734,6 +1734,24 @@ matmul_c16_avx512f (gfc_array_c16 * const restrict retarray,
|
||||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_c16_avx128_fma3 (gfc_array_c16 * const restrict retarray,
|
||||
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_c16_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_c16_avx128_fma4 (gfc_array_c16 * const restrict retarray,
|
||||
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_c16_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_c16_vanilla (gfc_array_c16 * const restrict retarray,
|
||||
@ -2332,6 +2350,26 @@ void matmul_c16 (gfc_array_c16 * const restrict retarray,
|
||||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_c16_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_c16_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
@ -1734,6 +1734,24 @@ matmul_c4_avx512f (gfc_array_c4 * const restrict retarray,
|
||||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_c4_avx128_fma3 (gfc_array_c4 * const restrict retarray,
|
||||
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_c4_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_c4_avx128_fma4 (gfc_array_c4 * const restrict retarray,
|
||||
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_c4_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_c4_vanilla (gfc_array_c4 * const restrict retarray,
|
||||
@ -2332,6 +2350,26 @@ void matmul_c4 (gfc_array_c4 * const restrict retarray,
|
||||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_c4_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_c4_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
@ -1734,6 +1734,24 @@ matmul_c8_avx512f (gfc_array_c8 * const restrict retarray,
|
||||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_c8_avx128_fma3 (gfc_array_c8 * const restrict retarray,
|
||||
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_c8_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_c8_avx128_fma4 (gfc_array_c8 * const restrict retarray,
|
||||
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_c8_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_c8_vanilla (gfc_array_c8 * const restrict retarray,
|
||||
@ -2332,6 +2350,26 @@ void matmul_c8 (gfc_array_c8 * const restrict retarray,
|
||||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_c8_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_c8_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
@ -1734,6 +1734,24 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict retarray,
|
||||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_i1_avx128_fma3 (gfc_array_i1 * const restrict retarray,
|
||||
gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_i1_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_i1_avx128_fma4 (gfc_array_i1 * const restrict retarray,
|
||||
gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_i1_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_i1_vanilla (gfc_array_i1 * const restrict retarray,
|
||||
@ -2332,6 +2350,26 @@ void matmul_i1 (gfc_array_i1 * const restrict retarray,
|
||||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_i1_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_i1_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
@ -1734,6 +1734,24 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict retarray,
|
||||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_i16_avx128_fma3 (gfc_array_i16 * const restrict retarray,
|
||||
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_i16_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_i16_avx128_fma4 (gfc_array_i16 * const restrict retarray,
|
||||
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_i16_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_i16_vanilla (gfc_array_i16 * const restrict retarray,
|
||||
@ -2332,6 +2350,26 @@ void matmul_i16 (gfc_array_i16 * const restrict retarray,
|
||||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_i16_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_i16_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
@ -1734,6 +1734,24 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict retarray,
|
||||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
|
||||
gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_i2_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
|
||||
gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_i2_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_i2_vanilla (gfc_array_i2 * const restrict retarray,
|
||||
@ -2332,6 +2350,26 @@ void matmul_i2 (gfc_array_i2 * const restrict retarray,
|
||||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_i2_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_i2_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
@ -1734,6 +1734,24 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict retarray,
|
||||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_i4_avx128_fma3 (gfc_array_i4 * const restrict retarray,
|
||||
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_i4_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_i4_avx128_fma4 (gfc_array_i4 * const restrict retarray,
|
||||
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_i4_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_i4_vanilla (gfc_array_i4 * const restrict retarray,
|
||||
@ -2332,6 +2350,26 @@ void matmul_i4 (gfc_array_i4 * const restrict retarray,
|
||||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_i4_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_i4_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
@ -1734,6 +1734,24 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict retarray,
|
||||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_i8_avx128_fma3 (gfc_array_i8 * const restrict retarray,
|
||||
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_i8_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_i8_avx128_fma4 (gfc_array_i8 * const restrict retarray,
|
||||
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_i8_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_i8_vanilla (gfc_array_i8 * const restrict retarray,
|
||||
@ -2332,6 +2350,26 @@ void matmul_i8 (gfc_array_i8 * const restrict retarray,
|
||||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_i8_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_i8_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
@ -1734,6 +1734,24 @@ matmul_r10_avx512f (gfc_array_r10 * const restrict retarray,
|
||||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_r10_avx128_fma3 (gfc_array_r10 * const restrict retarray,
|
||||
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_r10_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_r10_avx128_fma4 (gfc_array_r10 * const restrict retarray,
|
||||
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_r10_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_r10_vanilla (gfc_array_r10 * const restrict retarray,
|
||||
@ -2332,6 +2350,26 @@ void matmul_r10 (gfc_array_r10 * const restrict retarray,
|
||||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_r10_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_r10_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
@ -1734,6 +1734,24 @@ matmul_r16_avx512f (gfc_array_r16 * const restrict retarray,
|
||||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_r16_avx128_fma3 (gfc_array_r16 * const restrict retarray,
|
||||
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_r16_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_r16_avx128_fma4 (gfc_array_r16 * const restrict retarray,
|
||||
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_r16_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_r16_vanilla (gfc_array_r16 * const restrict retarray,
|
||||
@ -2332,6 +2350,26 @@ void matmul_r16 (gfc_array_r16 * const restrict retarray,
|
||||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_r16_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_r16_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
@ -1734,6 +1734,24 @@ matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
|
||||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_r4_avx128_fma3 (gfc_array_r4 * const restrict retarray,
|
||||
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_r4_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_r4_avx128_fma4 (gfc_array_r4 * const restrict retarray,
|
||||
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_r4_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_r4_vanilla (gfc_array_r4 * const restrict retarray,
|
||||
@ -2332,6 +2350,26 @@ void matmul_r4 (gfc_array_r4 * const restrict retarray,
|
||||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_r4_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_r4_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
@ -1734,6 +1734,24 @@ matmul_r8_avx512f (gfc_array_r8 * const restrict retarray,
|
||||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_r8_avx128_fma3 (gfc_array_r8 * const restrict retarray,
|
||||
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_r8_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_r8_avx128_fma4 (gfc_array_r8 * const restrict retarray,
|
||||
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_r8_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_r8_vanilla (gfc_array_r8 * const restrict retarray,
|
||||
@ -2332,6 +2350,26 @@ void matmul_r8 (gfc_array_r8 * const restrict retarray,
|
||||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_r8_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_r8_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
1152
libgfortran/generated/matmulavx128_c10.c
Normal file
1152
libgfortran/generated/matmulavx128_c10.c
Normal file
File diff suppressed because it is too large
Load Diff
1152
libgfortran/generated/matmulavx128_c16.c
Normal file
1152
libgfortran/generated/matmulavx128_c16.c
Normal file
File diff suppressed because it is too large
Load Diff
1152
libgfortran/generated/matmulavx128_c4.c
Normal file
1152
libgfortran/generated/matmulavx128_c4.c
Normal file
File diff suppressed because it is too large
Load Diff
1152
libgfortran/generated/matmulavx128_c8.c
Normal file
1152
libgfortran/generated/matmulavx128_c8.c
Normal file
File diff suppressed because it is too large
Load Diff
1152
libgfortran/generated/matmulavx128_i1.c
Normal file
1152
libgfortran/generated/matmulavx128_i1.c
Normal file
File diff suppressed because it is too large
Load Diff
1152
libgfortran/generated/matmulavx128_i16.c
Normal file
1152
libgfortran/generated/matmulavx128_i16.c
Normal file
File diff suppressed because it is too large
Load Diff
1152
libgfortran/generated/matmulavx128_i2.c
Normal file
1152
libgfortran/generated/matmulavx128_i2.c
Normal file
File diff suppressed because it is too large
Load Diff
1152
libgfortran/generated/matmulavx128_i4.c
Normal file
1152
libgfortran/generated/matmulavx128_i4.c
Normal file
File diff suppressed because it is too large
Load Diff
1152
libgfortran/generated/matmulavx128_i8.c
Normal file
1152
libgfortran/generated/matmulavx128_i8.c
Normal file
File diff suppressed because it is too large
Load Diff
1152
libgfortran/generated/matmulavx128_r10.c
Normal file
1152
libgfortran/generated/matmulavx128_r10.c
Normal file
File diff suppressed because it is too large
Load Diff
1152
libgfortran/generated/matmulavx128_r16.c
Normal file
1152
libgfortran/generated/matmulavx128_r16.c
Normal file
File diff suppressed because it is too large
Load Diff
1152
libgfortran/generated/matmulavx128_r4.c
Normal file
1152
libgfortran/generated/matmulavx128_r4.c
Normal file
File diff suppressed because it is too large
Load Diff
1152
libgfortran/generated/matmulavx128_r8.c
Normal file
1152
libgfortran/generated/matmulavx128_r8.c
Normal file
File diff suppressed because it is too large
Load Diff
@ -106,6 +106,26 @@ static' include(matmul_internal.m4)dnl
|
||||
static' include(matmul_internal.m4)dnl
|
||||
`#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
|
||||
`void
|
||||
'matmul_name` ('rtype` * const restrict retarray,
|
||||
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto('matmul_name`);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
|
||||
`void
|
||||
'matmul_name` ('rtype` * const restrict retarray,
|
||||
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto('matmul_name`);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
|
||||
`static' include(matmul_internal.m4)dnl
|
||||
@ -161,6 +181,26 @@ void matmul_'rtype_code` ('rtype` * const restrict retarray,
|
||||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_'rtype_code`_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_'rtype_code`_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
67
libgfortran/m4/matmulavx128.m4
Normal file
67
libgfortran/m4/matmulavx128.m4
Normal file
@ -0,0 +1,67 @@
|
||||
`/* Implementation of the MATMUL intrinsic
|
||||
Copyright (C) 2002-2017 Free Software Foundation, Inc.
|
||||
Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
|
||||
|
||||
This file is part of the GNU Fortran runtime library (libgfortran).
|
||||
|
||||
Libgfortran is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 3 of the License, or (at your option) any later version.
|
||||
|
||||
Libgfortran is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
Under Section 7 of GPL version 3, you are granted additional
|
||||
permissions described in the GCC Runtime Library Exception, version
|
||||
3.1, as published by the Free Software Foundation.
|
||||
|
||||
You should have received a copy of the GNU General Public License and
|
||||
a copy of the GCC Runtime Library Exception along with this program;
|
||||
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "libgfortran.h"
|
||||
#include <string.h>
|
||||
#include <assert.h>'
|
||||
|
||||
include(iparm.m4)dnl
|
||||
|
||||
/* These are the specific versions of matmul with -mprefer-avx128. */
|
||||
|
||||
`#if defined (HAVE_'rtype_name`)
|
||||
|
||||
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
|
||||
passed to us by the front-end, in which case we call it for large
|
||||
matrices. */
|
||||
|
||||
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
|
||||
const int *, const 'rtype_name` *, const 'rtype_name` *,
|
||||
const int *, const 'rtype_name` *, const int *,
|
||||
const 'rtype_name` *, 'rtype_name` *, const int *,
|
||||
int, int);
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
|
||||
`void
|
||||
'matmul_name` ('rtype` * const restrict retarray,
|
||||
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto('matmul_name`);
|
||||
'include(matmul_internal.m4)dnl
|
||||
`#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
|
||||
`void
|
||||
'matmul_name` ('rtype` * const restrict retarray,
|
||||
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto('matmul_name`);
|
||||
'include(matmul_internal.m4)dnl
|
||||
`#endif
|
||||
|
||||
#endif
|
||||
'
|
||||
Loading…
Reference in New Issue
Block a user