|
1 | 1 | /*******************************************************************************
|
2 | 2 | * Copyright 2019-2023 Intel Corporation
|
3 |
| -* Copyright 2021-2023 FUJITSU LIMITED |
| 3 | +* Copyright 2021-2024 FUJITSU LIMITED |
4 | 4 | * Copyright 2022 Arm Ltd. and affiliates
|
5 | 5 | *
|
6 | 6 | * Licensed under the Apache License, Version 2.0 (the "License");
|
@@ -918,10 +918,87 @@ void jit_uni_eltwise_injector_f32<isa>::log_compute_vector_fwd(
|
918 | 918 | }
|
919 | 919 | h->L(exitL);
|
920 | 920 | }
|
| 921 | +template <cpu_isa_t isa> |
| 922 | +void jit_uni_eltwise_injector_f32< |
| 923 | + isa>::gelu_erf_minimax_approx_compute_vector_fwd(const TRegS &vmm_src) { |
| 924 | + if (isa != sve_512) { // TODO: change this condition based on cpu id. |
| 925 | + return; |
| 926 | + } |
| 927 | + |
| 928 | + // register mapping |
| 929 | + TRegS vmm_pol = vmm_aux0; |
| 930 | + TRegS vmm_src_pos = vmm_aux1; |
| 931 | + TRegS vmm_indices = vmm_aux2; |
| 932 | + TRegS vmm_tmp = vmm_aux3; // this is for immediate read after write |
| 933 | + |
| 934 | + auto gather_coefficient |
| 935 | + = [&](TRegS vmm_coeff, int coeff_idx, TRegS vmm_pol_idx) { |
| 936 | + // we actually have 25 polynomials but pad to avoid unaligned accesses/ |
| 937 | + int gelu_erf_n_polynomials = 32; |
| 938 | + h->add_imm(h->X_TMP_1, x_table, |
| 939 | + table_off(gelu_erf_minimax_pol, |
| 940 | + coeff_idx * gelu_erf_n_polynomials), |
| 941 | + h->X_TMP_0); |
| 942 | + h->ld1w(ZRegS(IDX(vmm_coeff)), p_all / T_z, |
| 943 | + ptr(h->X_TMP_1, ZRegS(IDX(vmm_pol_idx)), SXTW)); |
| 944 | + }; |
| 945 | + |
| 946 | + // we use the erf function symmetry erf(-x) = -erf(x) |
| 947 | + // So we make x positive, we will reapply the sign after erf evaluation |
| 948 | + h->fabs(vmm_src_pos, p_all / T_z, vmm_src); |
| 949 | + |
| 950 | + // Compute indices for table lookup |
| 951 | + h->add(vmm_indices, vmm_src_pos, |
| 952 | + ZRegS(IDX(table_val(gelu_erf_idx_bias, z_tmp, 0)))); |
| 953 | + |
| 954 | + // An arithmetic shift is needed to properly map denormals to |
| 955 | + // their polynomial. we shift by 21 as we use 2 bits of mantissa |
| 956 | + // for indexing. |
| 957 | + h->asr(ZRegS(IDX(vmm_indices)), ZRegS(IDX(vmm_indices)), 21); |
| 958 | + |
| 959 | + // Apply special rules |
| 960 | + h->smax(vmm_indices, p_all / T_z, |
| 961 | + ZRegS(IDX(table_val(gelu_erf_one, z_tmp)))); |
| 962 | + h->smin(vmm_indices, p_all / T_z, |
| 963 | + ZRegS(IDX(table_val(gelu_erf_twenty_four, z_tmp)))); |
| 964 | + |
| 965 | + // We have to check |
| 966 | + // index = x_pos > rbound ? 23 : index; |
| 967 | + // for erf to return -1/1 when we should. |
| 968 | + h->fcmlt(p_mask.s, p_all / T_z, vmm_src_pos, |
| 969 | + ZRegS(IDX(table_val(gelu_erf_rbound, z_tmp)))); |
| 970 | + h->sel(vmm_indices, p_mask, vmm_indices, |
| 971 | + ZRegS(IDX(table_val(gelu_erf_twenty_three, z_tmp)))); |
| 972 | + |
| 973 | + // Adjusting indices |
| 974 | + h->mul(ZRegS(IDX(vmm_indices)), sizeof(float)); |
| 975 | + |
| 976 | + // Evaluate the polynomial |
| 977 | + gather_coefficient(vmm_pol, 5, vmm_indices); |
| 978 | + for (int deg = 4; deg >= 0; --deg) { |
| 979 | + gather_coefficient(vmm_tmp, deg, vmm_indices); |
| 980 | + h->fmad(vmm_pol, p_all / T_z, vmm_src_pos, vmm_tmp); |
| 981 | + } |
921 | 982 |
|
| 983 | + // Set the sign of vmm_pol properly |
| 984 | + h->mov(ZRegD(IDX(vmm_tmp)), ZRegD(IDX(vmm_src))); |
| 985 | + h->and_(ZRegD(IDX(vmm_tmp)), ZRegD(IDX(vmm_tmp)), |
| 986 | + ZRegD(IDX(table_val(sign_mask, z_tmp)))); |
| 987 | + h->eor(ZRegD(IDX(vmm_pol)), p_all / T_z, ZRegD(IDX(vmm_tmp))); |
| 988 | + |
| 989 | + // Compute the final output |
| 990 | + h->fadd(vmm_pol, vmm_pol, ZRegS(IDX(table_val(one, z_tmp)))); |
| 991 | + h->fmul(vmm_src, p_all / T_z, vmm_pol); |
| 992 | + h->fmul(vmm_src, vmm_src, ZRegS(IDX(table_val(half, z_tmp)))); |
| 993 | +} |
922 | 994 | template <cpu_isa_t isa>
|
923 | 995 | void jit_uni_eltwise_injector_f32<isa>::gelu_erf_compute_vector_fwd(
|
924 | 996 | const TRegS &vmm_src) {
|
| 997 | + |
| 998 | + if (isa == sve_512) { // TODO: consider performance improvement for lower ISA |
| 999 | + gelu_erf_minimax_approx_compute_vector_fwd(vmm_src); |
| 1000 | + return; |
| 1001 | + } |
925 | 1002 | // Here we approximate erf(x) using the expression by
|
926 | 1003 | // Abramowitz and Stegun from ``Handbook of Mathematical
|
927 | 1004 | // Functions''
|
@@ -1703,6 +1780,215 @@ void jit_uni_eltwise_injector_f32<isa>::register_table_entries() {
|
1703 | 1780 | {gelu_erf_pol, {0xbfba00e3, true}}, // p4 = -1.453152027f
|
1704 | 1781 | {gelu_erf_pol, {0x3f87dc22, true}}, // p5 = 1.061405429f
|
1705 | 1782 | };
|
| 1783 | + // gelu_erf(x) constants for the direct (minimax) erf approximation: they turn abs(x) into a polynomial index in gelu_erf_minimax_approx_compute_vector_fwd |
| 1784 | + static const table_t gelu_erf_minimax_consts { |
| 1785 | + {gelu_erf_idx_bias, {0xc21fffff, true}}, /* integer-added to the raw bits of abs(x) before the arithmetic shift by 21 */ |
| 1786 | + {gelu_erf_rbound, {0x40b15cee, true}}, /* float bound (approximately 5.54f): lanes whose abs(x) is not below it use index 23 */ |
| 1787 | + {gelu_erf_one, {0x00000001, true}}, /* 1: lower clamp applied to the index (smax) */ |
| 1788 | + {gelu_erf_twenty_three, {0x00000017, true}}, /* 23: index selected for lanes past rbound */ |
| 1789 | + {gelu_erf_twenty_four, {0x00000018, true}}, /* 24: upper clamp applied to the index (smin) */ |
| 1790 | + }; |
| 1791 | + // gelu_erf(x) minimax polynomials for piecewise approximation: 6 groups (degree 0..5) of 32 entries each (25 polynomials + 7 zero pads); the gather in gelu_erf_minimax_approx_compute_vector_fwd reads entry degree * 32 + polynomial index |
| 1792 | + static const table_t gelu_erf_minimax_polynomial { |
| 1793 | + // coefficients of degree 0 |
| 1794 | + {gelu_erf_minimax_pol, {0xa6f2cb94, false}}, // -0x1.e59728p-50 |
| 1795 | + {gelu_erf_minimax_pol, {0x32827792, false}}, // 0x1.04ef24p-26 |
| 1796 | + {gelu_erf_minimax_pol, {0x3381cc0c, false}}, // 0x1.039818p-24 |
| 1797 | + {gelu_erf_minimax_pol, {0x34523d4a, false}}, // 0x1.a47a94p-23 |
| 1798 | + {gelu_erf_minimax_pol, {0x351ac44d, false}}, // 0x1.35889ap-21 |
| 1799 | + {gelu_erf_minimax_pol, {0x35f36d88, false}}, // 0x1.e6db1p-20 |
| 1800 | + {gelu_erf_minimax_pol, {0x36ee8229, false}}, // 0x1.dd0452p-18 |
| 1801 | + {gelu_erf_minimax_pol, {0x37b8a3bb, false}}, // 0x1.714776p-16 |
| 1802 | + {gelu_erf_minimax_pol, {0x3867a213, false}}, // 0x1.cf4426p-15 |
| 1803 | + {gelu_erf_minimax_pol, {0x3940033b, false}}, // 0x1.800676p-13 |
| 1804 | + {gelu_erf_minimax_pol, {0x3a2a5a1d, false}}, // 0x1.54b43ap-11 |
| 1805 | + {gelu_erf_minimax_pol, {0x3ae35863, false}}, // 0x1.c6b0c6p-10 |
| 1806 | + {gelu_erf_minimax_pol, {0x3b7828f2, false}}, // 0x1.f051e4p-9 |
| 1807 | + {gelu_erf_minimax_pol, {0x3c08b14b, false}}, // 0x1.116296p-7 |
| 1808 | + {gelu_erf_minimax_pol, {0x3c515ed3, false}}, // 0x1.a2bda6p-7 |
| 1809 | + {gelu_erf_minimax_pol, {0xbb503236, false}}, // -0x1.a0646cp-9 |
| 1810 | + {gelu_erf_minimax_pol, {0xbd8d8e5e, false}}, // -0x1.1b1cbcp-4 |
| 1811 | + {gelu_erf_minimax_pol, {0xbe8abcd9, false}}, // -0x1.1579b2p-2 |
| 1812 | + {gelu_erf_minimax_pol, {0xbf0c19a2, false}}, // -0x1.183344p-1 |
| 1813 | + {gelu_erf_minimax_pol, {0xbeccb328, false}}, // -0x1.99665p-2 |
| 1814 | + {gelu_erf_minimax_pol, {0x3e176ced, false}}, // 0x1.2ed9dap-3 |
| 1815 | + {gelu_erf_minimax_pol, {0x3f470d99, false}}, // 0x1.8e1b32p-1 |
| 1816 | + {gelu_erf_minimax_pol, {0x3f7abb28, false}}, // 0x1.f5765p-1 |
| 1817 | + {gelu_erf_minimax_pol, {0x3f800000, false}}, // 0x1p0 |
| 1818 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 |
| 1819 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1820 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1821 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1822 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1823 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1824 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1825 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1826 | + // coefficients of degree 1 |
| 1827 | + {gelu_erf_minimax_pol, {0x3f4c422a, false}}, // 0x1.988454p-1 |
| 1828 | + {gelu_erf_minimax_pol, {0x3f4c421f, false}}, // 0x1.98843ep-1 |
| 1829 | + {gelu_erf_minimax_pol, {0x3f4c4207, false}}, // 0x1.98840ep-1 |
| 1830 | + {gelu_erf_minimax_pol, {0x3f4c41cb, false}}, // 0x1.988396p-1 |
| 1831 | + {gelu_erf_minimax_pol, {0x3f4c413b, false}}, // 0x1.988276p-1 |
| 1832 | + {gelu_erf_minimax_pol, {0x3f4c3fad, false}}, // 0x1.987f5ap-1 |
| 1833 | + {gelu_erf_minimax_pol, {0x3f4c3a2f, false}}, // 0x1.98745ep-1 |
| 1834 | + {gelu_erf_minimax_pol, {0x3f4c2d40, false}}, // 0x1.985a8p-1 |
| 1835 | + {gelu_erf_minimax_pol, {0x3f4c146a, false}}, // 0x1.9828d4p-1 |
| 1836 | + {gelu_erf_minimax_pol, {0x3f4bc341, false}}, // 0x1.978682p-1 |
| 1837 | + {gelu_erf_minimax_pol, {0x3f4ad08c, false}}, // 0x1.95a118p-1 |
| 1838 | + {gelu_erf_minimax_pol, {0x3f48f8cf, false}}, // 0x1.91f19ep-1 |
| 1839 | + {gelu_erf_minimax_pol, {0x3f45fac7, false}}, // 0x1.8bf58ep-1 |
| 1840 | + {gelu_erf_minimax_pol, {0x3f404e07, false}}, // 0x1.809c0ep-1 |
| 1841 | + {gelu_erf_minimax_pol, {0x3f3b980f, false}}, // 0x1.77301ep-1 |
| 1842 | + {gelu_erf_minimax_pol, {0x3f48dff3, false}}, // 0x1.91bfe6p-1 |
| 1843 | + {gelu_erf_minimax_pol, {0x3f78b21b, false}}, // 0x1.f16436p-1 |
| 1844 | + {gelu_erf_minimax_pol, {0x3fbb0704, false}}, // 0x1.760e08p0 |
| 1845 | + {gelu_erf_minimax_pol, {0x40019c32, false}}, // 0x1.033864p1 |
| 1846 | + {gelu_erf_minimax_pol, {0x3fe536d6, false}}, // 0x1.ca6dacp0 |
| 1847 | + {gelu_erf_minimax_pol, {0x3f81331e, false}}, // 0x1.02663cp0 |
| 1848 | + {gelu_erf_minimax_pol, {0x3e6c8684, false}}, // 0x1.d90d08p-3 |
| 1849 | + {gelu_erf_minimax_pol, {0x3c98f936, false}}, // 0x1.31f26cp-6 |
| 1850 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 |
| 1851 | + {gelu_erf_minimax_pol, {0x3f800000, false}}, // 0x1p0 |
| 1852 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1853 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1854 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1855 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1856 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1857 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1858 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1859 | + // coefficients of degree 2 |
| 1860 | + {gelu_erf_minimax_pol, {0xb62173f4, false}}, // -0x1.42e7e8p-19 |
| 1861 | + {gelu_erf_minimax_pol, {0x3735e4cf, false}}, // 0x1.6bc99ep-17 |
| 1862 | + {gelu_erf_minimax_pol, {0x37f2ff89, false}}, // 0x1.e5ff12p-16 |
| 1863 | + {gelu_erf_minimax_pol, {0x388c23be, false}}, // 0x1.18477cp-14 |
| 1864 | + {gelu_erf_minimax_pol, {0x3917535c, false}}, // 0x1.2ea6b8p-13 |
| 1865 | + {gelu_erf_minimax_pol, {0x39ab2ab0, false}}, // 0x1.56556p-12 |
| 1866 | + {gelu_erf_minimax_pol, {0x3a60fadb, false}}, // 0x1.c1f5b6p-11 |
| 1867 | + {gelu_erf_minimax_pol, {0x3af9b960, false}}, // 0x1.f372cp-10 |
| 1868 | + {gelu_erf_minimax_pol, {0x3b6e5491, false}}, // 0x1.dca922p-9 |
| 1869 | + {gelu_erf_minimax_pol, {0x3c0a4ec5, false}}, // 0x1.149d8ap-7 |
| 1870 | + {gelu_erf_minimax_pol, {0x3ca5aa8c, false}}, // 0x1.4b5518p-6 |
| 1871 | + {gelu_erf_minimax_pol, {0x3d2138d9, false}}, // 0x1.4271b2p-5 |
| 1872 | + {gelu_erf_minimax_pol, {0x3d8737d4, false}}, // 0x1.0e6fa8p-4 |
| 1873 | + {gelu_erf_minimax_pol, {0x3ddfb660, false}}, // 0x1.bf6ccp-4 |
| 1874 | + {gelu_erf_minimax_pol, {0x3e0f27ab, false}}, // 0x1.1e4f56p-3 |
| 1875 | + {gelu_erf_minimax_pol, {0x3d94004b, false}}, // 0x1.280096p-4 |
| 1876 | + {gelu_erf_minimax_pol, {0xbe0efdeb, false}}, // -0x1.1dfbd6p-3 |
| 1877 | + {gelu_erf_minimax_pol, {0xbf1d96c3, false}}, // -0x1.3b2d86p-1 |
| 1878 | + {gelu_erf_minimax_pol, {0xbf89db58, false}}, // -0x1.13b6bp0 |
| 1879 | + {gelu_erf_minimax_pol, {0xbf6d9897, false}}, // -0x1.db312ep-1 |
| 1880 | + {gelu_erf_minimax_pol, {0xbef69fb8, false}}, // -0x1.ed3f7p-2 |
| 1881 | + {gelu_erf_minimax_pol, {0xbdc4f8a8, false}}, // -0x1.89f15p-4 |
| 1882 | + {gelu_erf_minimax_pol, {0xbbde6422, false}}, // -0x1.bcc844p-8 |
| 1883 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 |
| 1884 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 |
| 1885 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1886 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1887 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1888 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1889 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1890 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1891 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1892 | + // coefficients of degree 3 |
| 1893 | + {gelu_erf_minimax_pol, {0xbe081a19, false}}, // -0x1.103432p-3 |
| 1894 | + {gelu_erf_minimax_pol, {0xbe084570, false}}, // -0x1.108aep-3 |
| 1895 | + {gelu_erf_minimax_pol, {0xbe08639b, false}}, // -0x1.10c736p-3 |
| 1896 | + {gelu_erf_minimax_pol, {0xbe089837, false}}, // -0x1.11306ep-3 |
| 1897 | + {gelu_erf_minimax_pol, {0xbe08f409, false}}, // -0x1.11e812p-3 |
| 1898 | + {gelu_erf_minimax_pol, {0xbe09ab95, false}}, // -0x1.13572ap-3 |
| 1899 | + {gelu_erf_minimax_pol, {0xbe0b66d0, false}}, // -0x1.16cdap-3 |
| 1900 | + {gelu_erf_minimax_pol, {0xbe0e400a, false}}, // -0x1.1c8014p-3 |
| 1901 | + {gelu_erf_minimax_pol, {0xbe124df8, false}}, // -0x1.249bfp-3 |
| 1902 | + {gelu_erf_minimax_pol, {0xbe1bde02, false}}, // -0x1.37bc04p-3 |
| 1903 | + {gelu_erf_minimax_pol, {0xbe2f19c9, false}}, // -0x1.5e3392p-3 |
| 1904 | + {gelu_erf_minimax_pol, {0xbe4931bf, false}}, // -0x1.92637ep-3 |
| 1905 | + {gelu_erf_minimax_pol, {0xbe685fbc, false}}, // -0x1.d0bf78p-3 |
| 1906 | + {gelu_erf_minimax_pol, {0xbe89c95f, false}}, // -0x1.1392bep-2 |
| 1907 | + {gelu_erf_minimax_pol, {0xbe96cbca, false}}, // -0x1.2d9794p-2 |
| 1908 | + {gelu_erf_minimax_pol, {0xbe8044aa, false}}, // -0x1.008954p-2 |
| 1909 | + {gelu_erf_minimax_pol, {0xbe0550f2, false}}, // -0x1.0aa1e4p-3 |
| 1910 | + {gelu_erf_minimax_pol, {0x3dcfd6a1, false}}, // 0x1.9fad42p-4 |
| 1911 | + {gelu_erf_minimax_pol, {0x3e94c826, false}}, // 0x1.29904cp-2 |
| 1912 | + {gelu_erf_minimax_pol, {0x3e79345f, false}}, // 0x1.f268bep-3 |
| 1913 | + {gelu_erf_minimax_pol, {0x3decec91, false}}, // 0x1.d9d922p-4 |
| 1914 | + {gelu_erf_minimax_pol, {0x3ca46568, false}}, // 0x1.48cadp-6 |
| 1915 | + {gelu_erf_minimax_pol, {0x3aa1e00a, false}}, // 0x1.43c014p-10 |
| 1916 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 |
| 1917 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 |
| 1918 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1919 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1920 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1921 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1922 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1923 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1924 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1925 | + // coefficients of degree 4 |
| 1926 | + {gelu_erf_minimax_pol, {0xba3d61db, false}}, // -0x1.7ac3b6p-11 |
| 1927 | + {gelu_erf_minimax_pol, {0x39f097a3, false}}, // 0x1.e12f46p-12 |
| 1928 | + {gelu_erf_minimax_pol, {0x3a5845dc, false}}, // 0x1.b08bb8p-11 |
| 1929 | + {gelu_erf_minimax_pol, {0x3ab1fa35, false}}, // 0x1.63f46ap-10 |
| 1930 | + {gelu_erf_minimax_pol, {0x3b0cefb8, false}}, // 0x1.19df7p-9 |
| 1931 | + {gelu_erf_minimax_pol, {0x3b653ab6, false}}, // 0x1.ca756cp-9 |
| 1932 | + {gelu_erf_minimax_pol, {0x3bcae527, false}}, // 0x1.95ca4ep-8 |
| 1933 | + {gelu_erf_minimax_pol, {0x3c221712, false}}, // 0x1.442e24p-7 |
| 1934 | + {gelu_erf_minimax_pol, {0x3c6c5840, false}}, // 0x1.d8b08p-7 |
| 1935 | + {gelu_erf_minimax_pol, {0x3cc0a703, false}}, // 0x1.814e06p-6 |
| 1936 | + {gelu_erf_minimax_pol, {0x3d1dcc19, false}}, // 0x1.3b9832p-5 |
| 1937 | + {gelu_erf_minimax_pol, {0x3d63656d, false}}, // 0x1.c6cadap-5 |
| 1938 | + {gelu_erf_minimax_pol, {0x3d955907, false}}, // 0x1.2ab20ep-4 |
| 1939 | + {gelu_erf_minimax_pol, {0x3dbf9910, false}}, // 0x1.7f322p-4 |
| 1940 | + {gelu_erf_minimax_pol, {0x3dd53f69, false}}, // 0x1.aa7ed2p-4 |
| 1941 | + {gelu_erf_minimax_pol, {0x3db7dcef, false}}, // 0x1.6fb9dep-4 |
| 1942 | + {gelu_erf_minimax_pol, {0x3d639ebe, false}}, // 0x1.c73d7cp-5 |
| 1943 | + {gelu_erf_minimax_pol, {0xba6ede48, false}}, // -0x1.ddbc9p-11 |
| 1944 | + {gelu_erf_minimax_pol, {0xbd22be69, false}}, // -0x1.457cd2p-5 |
| 1945 | + {gelu_erf_minimax_pol, {0xbd041cf1, false}}, // -0x1.0839e2p-5 |
| 1946 | + {gelu_erf_minimax_pol, {0xbc64f5ab, false}}, // -0x1.c9eb56p-7 |
| 1947 | + {gelu_erf_minimax_pol, {0xbb097a32, false}}, // -0x1.12f464p-9 |
| 1948 | + {gelu_erf_minimax_pol, {0xb8ebf380, false}}, // -0x1.d7e7p-14 |
| 1949 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 |
| 1950 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 |
| 1951 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1952 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1953 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1954 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1955 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1956 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1957 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1958 | + // coefficients of degree 5 |
| 1959 | + {gelu_erf_minimax_pol, {0x3cb7d80c, false}}, // 0x1.6fb018p-6 |
| 1960 | + {gelu_erf_minimax_pol, {0x3c9b6050, false}}, // 0x1.36c0ap-6 |
| 1961 | + {gelu_erf_minimax_pol, {0x3c978d11, false}}, // 0x1.2f1a22p-6 |
| 1962 | + {gelu_erf_minimax_pol, {0x3c92e850, false}}, // 0x1.25d0ap-6 |
| 1963 | + {gelu_erf_minimax_pol, {0x3c8d058b, false}}, // 0x1.1a0b16p-6 |
| 1964 | + {gelu_erf_minimax_pol, {0x3c848454, false}}, // 0x1.0908a8p-6 |
| 1965 | + {gelu_erf_minimax_pol, {0x3c6cd623, false}}, // 0x1.d9ac46p-7 |
| 1966 | + {gelu_erf_minimax_pol, {0x3c4c824b, false}}, // 0x1.990496p-7 |
| 1967 | + {gelu_erf_minimax_pol, {0x3c2a7935, false}}, // 0x1.54f26ap-7 |
| 1968 | + {gelu_erf_minimax_pol, {0x3be0b390, false}}, // 0x1.c1672p-8 |
| 1969 | + {gelu_erf_minimax_pol, {0x3b0651ac, false}}, // 0x1.0ca358p-9 |
| 1970 | + {gelu_erf_minimax_pol, {0xbb232f53, false}}, // -0x1.465ea6p-9 |
| 1971 | + {gelu_erf_minimax_pol, {0xbbd42fa0, false}}, // -0x1.a85f4p-8 |
| 1972 | + {gelu_erf_minimax_pol, {0xbc2c5366, false}}, // -0x1.58a6ccp-7 |
| 1973 | + {gelu_erf_minimax_pol, {0xbc492c9e, false}}, // -0x1.92593cp-7 |
| 1974 | + {gelu_erf_minimax_pol, {0xbc2a7aa6, false}}, // -0x1.54f54cp-7 |
| 1975 | + {gelu_erf_minimax_pol, {0xbbd55d04, false}}, // -0x1.aaba08p-8 |
| 1976 | + {gelu_erf_minimax_pol, {0xba823a76, false}}, // -0x1.0474ecp-10 |
| 1977 | + {gelu_erf_minimax_pol, {0x3b102aa8, false}}, // 0x1.20555p-9 |
| 1978 | + {gelu_erf_minimax_pol, {0x3ae25a7e, false}}, // 0x1.c4b4fcp-10 |
| 1979 | + {gelu_erf_minimax_pol, {0x3a31f792, false}}, // 0x1.63ef24p-11 |
| 1980 | + {gelu_erf_minimax_pol, {0x38b84375, false}}, // 0x1.7086eap-14 |
| 1981 | + {gelu_erf_minimax_pol, {0x3689bb5a, false}}, // 0x1.1376b4p-18 |
| 1982 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 |
| 1983 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 |
| 1984 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1985 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1986 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1987 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1988 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1989 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1990 | + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 (padding) |
| 1991 | + }; |
1706 | 1992 |
|
1707 | 1993 | // This object takes care about which constants and polynomials to include.
|
1708 | 1994 | struct need_t {
|
@@ -1780,7 +2066,8 @@ void jit_uni_eltwise_injector_f32<isa>::register_table_entries() {
|
1780 | 2066 | if (need.gelu_tanh()) push_entries_of(gelu_tanh_consts);
|
1781 | 2067 | if (need.gelu_erf()) push_entries_of(gelu_erf_consts);
|
1782 | 2068 | if (need.gelu_erf()) push_entries_of(gelu_erf_polynomial);
|
1783 |
| - |
| 2069 | + if (need.gelu_erf()) push_entries_of(gelu_erf_minimax_consts); /* NOTE(review): registered whenever gelu_erf is needed, independent of isa; the only visible reader is gated on isa == sve_512 - confirm the extra table entries are intended for other ISAs */ |
| 2070 | + if (need.gelu_erf()) push_entries_of(gelu_erf_minimax_polynomial); /* 6 * 32 coefficient entries read by the gather in gelu_erf_minimax_approx_compute_vector_fwd */ |
1784 | 2071 | // Now that we registered the entries, we set the offsets. No
|
1785 | 2072 | // entries should be registered after this point. This allows to
|
1786 | 2073 | // expect the same order when injecting the table entries in
|
|
0 commit comments