i3 my balls

i3/.config/i3/config (new file, 126 lines)
@@ -0,0 +1,126 @@
set $mod Mod4

font pango:monospace 8

exec --no-startup-id dex-autostart --autostart --environment i3
exec --no-startup-id xss-lock --transfer-sleep-lock -- i3lock --nofork
exec --no-startup-id nm-applet

# Use pactl to adjust volume in PulseAudio.
set $refresh_i3status killall -SIGUSR1 i3status
bindsym XF86AudioRaiseVolume exec --no-startup-id pactl set-sink-volume @DEFAULT_SINK@ +10% && $refresh_i3status
bindsym XF86AudioLowerVolume exec --no-startup-id pactl set-sink-volume @DEFAULT_SINK@ -10% && $refresh_i3status
bindsym XF86AudioMute exec --no-startup-id pactl set-sink-mute @DEFAULT_SINK@ toggle && $refresh_i3status
bindsym XF86AudioMicMute exec --no-startup-id pactl set-source-mute @DEFAULT_SOURCE@ toggle && $refresh_i3status

# Use Mouse+$mod to drag floating windows to their wanted position
floating_modifier $mod

# move tiling windows via drag & drop by left-clicking into the title bar,
# or left-clicking anywhere into the window while holding the floating modifier.
tiling_drag modifier titlebar

# start a terminal
bindsym $mod+Return exec wezterm

# kill focused window
bindsym $mod+w kill

# start dmenu (a program launcher)
bindsym $mod+space exec --no-startup-id dmenu_run

# change focus
bindsym $mod+h focus left
bindsym $mod+j focus down
bindsym $mod+k focus up
bindsym $mod+l focus right

# move focused window
bindsym $mod+Shift+h move left
bindsym $mod+Shift+j move down
bindsym $mod+Shift+k move up
bindsym $mod+Shift+l move right

# enter fullscreen mode for the focused container
bindsym $mod+f fullscreen toggle

# toggle floating for the focused container
bindsym $mod+v floating toggle

# Define names for default workspaces for which we configure key bindings later on.
# We use variables to avoid repeating the names in multiple places.
set $ws1 "1"
set $ws2 "2"
set $ws3 "3"
set $ws4 "4"
set $ws5 "5"
set $ws6 "6"
set $ws7 "7"
set $ws8 "8"
set $ws9 "9"
set $ws10 "10"

# switch to workspace
bindsym $mod+1 workspace number $ws1
bindsym $mod+2 workspace number $ws2
bindsym $mod+3 workspace number $ws3
bindsym $mod+4 workspace number $ws4
bindsym $mod+5 workspace number $ws5
bindsym $mod+6 workspace number $ws6
bindsym $mod+7 workspace number $ws7
bindsym $mod+8 workspace number $ws8
bindsym $mod+9 workspace number $ws9
bindsym $mod+0 workspace number $ws10

# move focused container to workspace
bindsym $mod+Shift+1 move container to workspace number $ws1
bindsym $mod+Shift+2 move container to workspace number $ws2
bindsym $mod+Shift+3 move container to workspace number $ws3
bindsym $mod+Shift+4 move container to workspace number $ws4
bindsym $mod+Shift+5 move container to workspace number $ws5
bindsym $mod+Shift+6 move container to workspace number $ws6
bindsym $mod+Shift+7 move container to workspace number $ws7
bindsym $mod+Shift+8 move container to workspace number $ws8
bindsym $mod+Shift+9 move container to workspace number $ws9
bindsym $mod+Shift+0 move container to workspace number $ws10

# restart i3 in place (preserves your layout/session, can be used to upgrade i3)
bindsym $mod+Shift+r restart
# exit i3 (logs you out of your X session)
bindsym $mod+Shift+e exec "i3-nagbar -t warning -m 'You pressed the exit shortcut. Do you really want to exit i3? This will end your X session.' -B 'Yes, exit i3' 'i3-msg exit'"

# resize window (you can also use the mouse for that)
mode "resize" {
        # These bindings trigger as soon as you enter the resize mode.
        # Pressing h will shrink the window’s width.
        # Pressing l will grow the window’s width.
        # Pressing k will shrink the window’s height.
        # Pressing j will grow the window’s height.
        bindsym h resize shrink width 10 px or 10 ppt
        bindsym j resize grow height 10 px or 10 ppt
        bindsym k resize shrink height 10 px or 10 ppt
        bindsym l resize grow width 10 px or 10 ppt

        # back to normal: Enter or Escape or $mod+r
        bindsym Return mode "default"
        bindsym Escape mode "default"
        bindsym $mod+r mode "default"
}

bindsym $mod+r mode "resize"

# Start i3bar to display a workspace bar (plus the system information i3status
# finds out, if available)
bar {
        status_command i3status
}

# external programs
bindsym $mod+d exec flatpak run dev.vencord.Vesktop
bindsym $mod+b exec firefox
bindsym $mod+s exec flatpak run org.vinegarhq.Sober
bindsym $mod+g exec steam
bindsym $mod+m exec flatpak run org.prismlauncher.PrismLauncher

bindsym $mod+Shift+f exec nautilus

# select a region with scrot, copy it to the clipboard, and keep a copy at ~/latest.png
bindsym --release $mod+Shift+s exec sh -c "scrot -s /tmp/screenshot.png && xclip -selection clipboard -t image/png -i /tmp/screenshot.png && cp /tmp/screenshot.png ~/latest.png"
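
A quick sanity check before restarting i3 with the config above: i3 can validate a config file without touching the running session, and the $mod+Shift+s screenshot chain can be exercised from a shell first. A minimal sketch, assuming the stock XDG config path:

    # Parse-check the config; prints errors and exits non-zero on failure.
    i3 -C -c ~/.config/i3/config

    # Dry-run the screenshot chain: select a region, copy the PNG to the
    # clipboard, and keep a copy at ~/latest.png.
    scrot -s /tmp/screenshot.png \
      && xclip -selection clipboard -t image/png -i /tmp/screenshot.png \
      && cp /tmp/screenshot.png ~/latest.png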

i3/.config/i3status/config (new file, 61 lines)
@@ -0,0 +1,61 @@
# i3status configuration file.
# see "man i3status" for documentation.

# It is important that this file is edited as UTF-8.
# The following line should contain a sharp s:
# ß
# If the above line is not correctly displayed, fix your editor first!

general {
        colors = true
        interval = 5
}

order += "ipv6"
order += "cpu_temperature 0"
order += "disk /"
order += "wireless _first_"
order += "ethernet _first_"
# order += "battery all"
order += "load"
order += "tztime local"

cpu_temperature 0 {
        format = "Tea: %degrees °C"
        path = "/sys/class/hwmon/hwmon1/temp1_input"
        # max_threshold is in degrees Celsius, not millidegrees.
        max_threshold = 80
}

wireless _first_ {
        # format_up = "W: (%quality at %essid) %ip"
        format_up = "W: (%quality) Leaked IP: %ip"
        format_down = "W: down"
}

ethernet _first_ {
        # if you use %speed, i3status requires root privileges
        # format_up = "E: %ip (%speed)"
        format_up = "E: Leaked IP: %ip (%speed)"
        format_down = "E: down"
}

battery all {
        format = "Fairy Dust: %percentage %status %remaining"
}

tztime local {
        format = "%d %H:%M:%S"
}

load {
        format = "Hot Loads: %1min"
}

disk "/" {
        format = "Penger Folder: %avail"
}

ipv6 {
        format_up = "Useless Protocol: %ipv6"
        format_down = "Useless Protocol: Down"
}
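
One caveat on the cpu_temperature block above: the hwmon index in path is machine-specific and can even change across reboots, so hwmon1 is an assumption about this particular box. A small sketch for locating the right sensor:

    # List hwmon devices with their driver names; pick the CPU one
    # (e.g. "coretemp" on Intel, "k10temp" on AMD).
    for d in /sys/class/hwmon/hwmon*; do
      printf '%s: %s\n' "$d" "$(cat "$d/name")"
    done

    # Raw readings are in millidegrees Celsius; i3status divides by 1000
    # before substituting %degrees.
    cat /sys/class/hwmon/hwmon1/temp1_input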

@@ -1,16 +1,6 @@
-set -g default-terminal "tmux-256color"
-set -ag terminal-overrides ",xterm-256color:RGB"
+unbind C-b
 set -g prefix C-s
-set -g base-index 1
-set -g renumber-windows on
+bind C-s send-prefix
 set -g mode-keys vi
-set -g status-position top
-set -g status-justify absolute-centre
-set -g status-style "bg=default"
-set -g window-status-current-style "fg=blue bold"
-set -g status-right ""
-set -g status-left "#S"
-
-bind r source-file "~/.config/tmux/tmux.conf"
-bind b set -g status
-bind G neww -n "lazygit" lazygit
+set -g status-keys vi
+set -s escape-time 0
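
Note that the slimmed-down tmux config above drops the old `bind r` reload binding, so applying further edits is now done from a shell (or from the command prompt behind the new C-s prefix):

    # Apply the edited config to the running tmux server.
    tmux source-file ~/.config/tmux/tmux.conf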

@@ -23,7 +23,7 @@ config.window_padding = {
 
 config.scrollback_lines = 1000
 
-config.font = wezterm.font("ComicShannsMono Nerd Font")
+-- config.font = wezterm.font("ComicShannsMono")
 config.font_size = 20.0
 
 config.front_end = "WebGpu"
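
With config.font commented out, WezTerm falls back to its bundled default font (JetBrains Mono). Before re-enabling the line, it is worth confirming which Comic Shanns build is actually installed; a sketch using WezTerm's own CLI:

    # List the system fonts WezTerm can resolve and look for Comic Shanns.
    wezterm ls-fonts --list-system | grep -i 'comic'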
Submodule zsh/.local/share/nvim/lazy/cmp-nvim-lsp deleted from bd5a7d6db1
Submodule zsh/.local/share/nvim/lazy/cmp-path deleted from c642487086
Submodule zsh/.local/share/nvim/lazy/gruber-darker.nvim deleted from 98a2e14198
Submodule zsh/.local/share/nvim/lazy/lazy.nvim deleted from 85c7ff3711
Submodule zsh/.local/share/nvim/lazy/mason-lspconfig.nvim deleted from f760507df8
Submodule zsh/.local/share/nvim/lazy/mason.nvim deleted from b3689a41dd
Submodule zsh/.local/share/nvim/lazy/nvim-cmp deleted from 106c4bcc05
Submodule zsh/.local/share/nvim/lazy/nvim-lspconfig deleted from 336b388c27
Submodule zsh/.local/share/nvim/lazy/plenary.nvim deleted from b9fd5226c2
Submodule zsh/.local/share/nvim/lazy/snacks.nvim deleted from 5e0e869852
Submodule zsh/.local/share/nvim/lazy/telescope.nvim deleted from a0bbec2114
Submodule zsh/.local/share/nvim/lazy/vim-fugitive deleted from 61b51c09b7
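
For reference, submodule deletions like the ones above are typically produced with `git rm`, which removes the gitlink, the working tree, and the matching .gitmodules section in one step. A hedged sketch, using one of the paths from this commit:

    # Deregister the submodule, then delete it and its .gitmodules entry.
    git submodule deinit -f zsh/.local/share/nvim/lazy/lazy.nvim
    git rm -f zsh/.local/share/nvim/lazy/lazy.nvim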

@@ -1 +0,0 @@
-../packages/clangd/clangd_21.1.0/bin/clangd

@@ -1 +0,0 @@
-../packages/gopls/gopls

@@ -1 +0,0 @@
-../packages/lua-language-server/lua-language-server

@@ -1 +0,0 @@
-../packages/ols/ols-x86_64-unknown-linux-gnu
@@ -1,279 +0,0 @@
(deleted: vendored LLVM LICENSE.TXT containing the standard Apache License 2.0 with LLVM Exceptions, the third-party-software notice, and the legacy University of Illinois/NCSA license; 279 lines of unmodified upstream license boilerplate omitted)
Binary file not shown.
@@ -1,121 +0,0 @@
(deleted: vendored clang header cuda_builtin_vars.h, which implements the CUDA built-in variables threadIdx, blockIdx, blockDim, gridDim, and warpSize via __declspec(property) getters; 121 lines of unmodified upstream code omitted)
@@ -1,512 +0,0 @@
(deleted: vendored clang header __clang_cuda_cmath.h, device-side overloads of the std math functions for CUDA and OpenMP offloading; 512 lines of unmodified upstream code omitted)
@@ -1,285 +0,0 @@
|
|||||||
/*===-- __clang_cuda_complex_builtins - CUDA impls of runtime complex fns ---===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __CLANG_CUDA_COMPLEX_BUILTINS
#define __CLANG_CUDA_COMPLEX_BUILTINS

// This header defines __muldc3, __mulsc3, __divdc3, and __divsc3. These are
// libgcc functions that clang assumes are available when compiling c99 complex
// operations. (These implementations come from libc++, and have been modified
// to work with CUDA and OpenMP target offloading [in C and C++ mode].)
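// For orientation, a minimal sketch of why these symbols are needed: given
//   float _Complex __x, __y;
// the expression __x * __y is typically lowered by clang to a call to
// __mulsc3(crealf(__x), cimagf(__x), crealf(__y), cimagf(__y)) so the
// NaN/infinity fix-ups required by C99 Annex G happen out of line, and that
// call must resolve to a device function when compiling for the GPU.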

#pragma push_macro("__DEVICE__")
#if defined(__OPENMP_NVPTX__) || defined(__OPENMP_AMDGCN__)
#pragma omp declare target
#define __DEVICE__ __attribute__((noinline, nothrow, cold, weak))
#else
#define __DEVICE__ __device__ inline
#endif

// To make the algorithms available for C and C++ in CUDA and OpenMP we select
// different but equivalent function versions. TODO: For OpenMP we currently
// select the native builtins as the overload support for templates is lacking.
#if !defined(__OPENMP_NVPTX__) && !defined(__OPENMP_AMDGCN__)
#define _ISNANd std::isnan
#define _ISNANf std::isnan
#define _ISINFd std::isinf
#define _ISINFf std::isinf
#define _ISFINITEd std::isfinite
#define _ISFINITEf std::isfinite
#define _COPYSIGNd std::copysign
#define _COPYSIGNf std::copysign
#define _SCALBNd std::scalbn
#define _SCALBNf std::scalbn
#define _ABSd std::abs
#define _ABSf std::abs
#define _LOGBd std::logb
#define _LOGBf std::logb
// Rather than pulling in std::max from <algorithm> every time, use the
// available ::max.
#define _fmaxd max
#define _fmaxf max
#else
#ifdef __AMDGCN__
#define _ISNANd __ocml_isnan_f64
#define _ISNANf __ocml_isnan_f32
#define _ISINFd __ocml_isinf_f64
#define _ISINFf __ocml_isinf_f32
#define _ISFINITEd __ocml_isfinite_f64
#define _ISFINITEf __ocml_isfinite_f32
#define _COPYSIGNd __ocml_copysign_f64
#define _COPYSIGNf __ocml_copysign_f32
#define _SCALBNd __ocml_scalbn_f64
#define _SCALBNf __ocml_scalbn_f32
#define _ABSd __ocml_fabs_f64
#define _ABSf __ocml_fabs_f32
#define _LOGBd __ocml_logb_f64
#define _LOGBf __ocml_logb_f32
#define _fmaxd __ocml_fmax_f64
#define _fmaxf __ocml_fmax_f32
#else
#define _ISNANd __nv_isnand
#define _ISNANf __nv_isnanf
#define _ISINFd __nv_isinfd
#define _ISINFf __nv_isinff
#define _ISFINITEd __nv_isfinited
#define _ISFINITEf __nv_finitef
#define _COPYSIGNd __nv_copysign
#define _COPYSIGNf __nv_copysignf
#define _SCALBNd __nv_scalbn
#define _SCALBNf __nv_scalbnf
#define _ABSd __nv_fabs
#define _ABSf __nv_fabsf
#define _LOGBd __nv_logb
#define _LOGBf __nv_logbf
#define _fmaxd __nv_fmax
#define _fmaxf __nv_fmaxf
#endif
#endif

#if defined(__cplusplus)
extern "C" {
#endif

__DEVICE__ double _Complex __muldc3(double __a, double __b, double __c,
                                    double __d) {
  double __ac = __a * __c;
  double __bd = __b * __d;
  double __ad = __a * __d;
  double __bc = __b * __c;
  double _Complex z;
  __real__(z) = __ac - __bd;
  __imag__(z) = __ad + __bc;
  if (_ISNANd(__real__(z)) && _ISNANd(__imag__(z))) {
    int __recalc = 0;
    if (_ISINFd(__a) || _ISINFd(__b)) {
      __a = _COPYSIGNd(_ISINFd(__a) ? 1 : 0, __a);
      __b = _COPYSIGNd(_ISINFd(__b) ? 1 : 0, __b);
      if (_ISNANd(__c))
        __c = _COPYSIGNd(0, __c);
      if (_ISNANd(__d))
        __d = _COPYSIGNd(0, __d);
      __recalc = 1;
    }
    if (_ISINFd(__c) || _ISINFd(__d)) {
      __c = _COPYSIGNd(_ISINFd(__c) ? 1 : 0, __c);
      __d = _COPYSIGNd(_ISINFd(__d) ? 1 : 0, __d);
      if (_ISNANd(__a))
        __a = _COPYSIGNd(0, __a);
      if (_ISNANd(__b))
        __b = _COPYSIGNd(0, __b);
      __recalc = 1;
    }
    if (!__recalc &&
        (_ISINFd(__ac) || _ISINFd(__bd) || _ISINFd(__ad) || _ISINFd(__bc))) {
      if (_ISNANd(__a))
        __a = _COPYSIGNd(0, __a);
      if (_ISNANd(__b))
        __b = _COPYSIGNd(0, __b);
      if (_ISNANd(__c))
        __c = _COPYSIGNd(0, __c);
      if (_ISNANd(__d))
        __d = _COPYSIGNd(0, __d);
      __recalc = 1;
    }
    if (__recalc) {
      // Can't use std::numeric_limits<double>::infinity() -- that doesn't have
      // a device overload (and isn't constexpr before C++11, naturally).
      __real__(z) = __builtin_huge_val() * (__a * __c - __b * __d);
      __imag__(z) = __builtin_huge_val() * (__a * __d + __b * __c);
    }
  }
  return z;
}
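// A worked case for the recalculation path above (per C99 Annex G): for
// (Inf + NaN*i) * (2 + 3*i) the naive products make both parts NaN, so the
// code rewrites the infinite operand as a signed 1, the NaN parts as signed
// 0s, and rescales by __builtin_huge_val() to recover a correctly signed
// infinite result instead of NaN + NaN*i.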

__DEVICE__ float _Complex __mulsc3(float __a, float __b, float __c, float __d) {
  float __ac = __a * __c;
  float __bd = __b * __d;
  float __ad = __a * __d;
  float __bc = __b * __c;
  float _Complex z;
  __real__(z) = __ac - __bd;
  __imag__(z) = __ad + __bc;
  if (_ISNANf(__real__(z)) && _ISNANf(__imag__(z))) {
    int __recalc = 0;
    if (_ISINFf(__a) || _ISINFf(__b)) {
      __a = _COPYSIGNf(_ISINFf(__a) ? 1 : 0, __a);
      __b = _COPYSIGNf(_ISINFf(__b) ? 1 : 0, __b);
      if (_ISNANf(__c))
        __c = _COPYSIGNf(0, __c);
      if (_ISNANf(__d))
        __d = _COPYSIGNf(0, __d);
      __recalc = 1;
    }
    if (_ISINFf(__c) || _ISINFf(__d)) {
      __c = _COPYSIGNf(_ISINFf(__c) ? 1 : 0, __c);
      __d = _COPYSIGNf(_ISINFf(__d) ? 1 : 0, __d);
      if (_ISNANf(__a))
        __a = _COPYSIGNf(0, __a);
      if (_ISNANf(__b))
        __b = _COPYSIGNf(0, __b);
      __recalc = 1;
    }
    if (!__recalc &&
        (_ISINFf(__ac) || _ISINFf(__bd) || _ISINFf(__ad) || _ISINFf(__bc))) {
      if (_ISNANf(__a))
        __a = _COPYSIGNf(0, __a);
      if (_ISNANf(__b))
        __b = _COPYSIGNf(0, __b);
      if (_ISNANf(__c))
        __c = _COPYSIGNf(0, __c);
      if (_ISNANf(__d))
        __d = _COPYSIGNf(0, __d);
      __recalc = 1;
    }
    if (__recalc) {
      __real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
      __imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
    }
  }
  return z;
}

__DEVICE__ double _Complex __divdc3(double __a, double __b, double __c,
                                    double __d) {
  int __ilogbw = 0;
  // Can't use std::max, because that's defined in <algorithm>, and we don't
  // want to pull that in for every compile. The CUDA headers define
  // ::max(float, float) and ::max(double, double), which is sufficient for us.
  double __logbw = _LOGBd(_fmaxd(_ABSd(__c), _ABSd(__d)));
  if (_ISFINITEd(__logbw)) {
    __ilogbw = (int)__logbw;
    __c = _SCALBNd(__c, -__ilogbw);
    __d = _SCALBNd(__d, -__ilogbw);
  }
  double __denom = __c * __c + __d * __d;
  double _Complex z;
  __real__(z) = _SCALBNd((__a * __c + __b * __d) / __denom, -__ilogbw);
  __imag__(z) = _SCALBNd((__b * __c - __a * __d) / __denom, -__ilogbw);
  if (_ISNANd(__real__(z)) && _ISNANd(__imag__(z))) {
    if ((__denom == 0.0) && (!_ISNANd(__a) || !_ISNANd(__b))) {
      __real__(z) = _COPYSIGNd(__builtin_huge_val(), __c) * __a;
      __imag__(z) = _COPYSIGNd(__builtin_huge_val(), __c) * __b;
    } else if ((_ISINFd(__a) || _ISINFd(__b)) && _ISFINITEd(__c) &&
               _ISFINITEd(__d)) {
      __a = _COPYSIGNd(_ISINFd(__a) ? 1.0 : 0.0, __a);
      __b = _COPYSIGNd(_ISINFd(__b) ? 1.0 : 0.0, __b);
      __real__(z) = __builtin_huge_val() * (__a * __c + __b * __d);
      __imag__(z) = __builtin_huge_val() * (__b * __c - __a * __d);
    } else if (_ISINFd(__logbw) && __logbw > 0.0 && _ISFINITEd(__a) &&
               _ISFINITEd(__b)) {
      __c = _COPYSIGNd(_ISINFd(__c) ? 1.0 : 0.0, __c);
      __d = _COPYSIGNd(_ISINFd(__d) ? 1.0 : 0.0, __d);
      __real__(z) = 0.0 * (__a * __c + __b * __d);
      __imag__(z) = 0.0 * (__b * __c - __a * __d);
    }
  }
  return z;
}
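// The logb/scalbn steps above are classic range reduction: the divisor is
// scaled down by 2^-__ilogbw so that __c * __c + __d * __d can neither
// overflow nor flush to zero prematurely, and the same power of two is
// folded back into the quotient afterwards.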

__DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) {
  int __ilogbw = 0;
  float __logbw = _LOGBf(_fmaxf(_ABSf(__c), _ABSf(__d)));
  if (_ISFINITEf(__logbw)) {
    __ilogbw = (int)__logbw;
    __c = _SCALBNf(__c, -__ilogbw);
    __d = _SCALBNf(__d, -__ilogbw);
  }
  float __denom = __c * __c + __d * __d;
  float _Complex z;
  __real__(z) = _SCALBNf((__a * __c + __b * __d) / __denom, -__ilogbw);
  __imag__(z) = _SCALBNf((__b * __c - __a * __d) / __denom, -__ilogbw);
  if (_ISNANf(__real__(z)) && _ISNANf(__imag__(z))) {
    if ((__denom == 0) && (!_ISNANf(__a) || !_ISNANf(__b))) {
      __real__(z) = _COPYSIGNf(__builtin_huge_valf(), __c) * __a;
      __imag__(z) = _COPYSIGNf(__builtin_huge_valf(), __c) * __b;
    } else if ((_ISINFf(__a) || _ISINFf(__b)) && _ISFINITEf(__c) &&
               _ISFINITEf(__d)) {
      __a = _COPYSIGNf(_ISINFf(__a) ? 1 : 0, __a);
      __b = _COPYSIGNf(_ISINFf(__b) ? 1 : 0, __b);
      __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
      __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
    } else if (_ISINFf(__logbw) && __logbw > 0 && _ISFINITEf(__a) &&
               _ISFINITEf(__b)) {
      __c = _COPYSIGNf(_ISINFf(__c) ? 1 : 0, __c);
      __d = _COPYSIGNf(_ISINFf(__d) ? 1 : 0, __d);
      __real__(z) = 0 * (__a * __c + __b * __d);
      __imag__(z) = 0 * (__b * __c - __a * __d);
    }
  }
  return z;
}

#if defined(__cplusplus)
} // extern "C"
#endif

#undef _ISNANd
#undef _ISNANf
#undef _ISINFd
#undef _ISINFf
#undef _COPYSIGNd
#undef _COPYSIGNf
#undef _ISFINITEd
#undef _ISFINITEf
#undef _SCALBNd
#undef _SCALBNf
#undef _ABSd
#undef _ABSf
#undef _LOGBd
#undef _LOGBf
#undef _fmaxd
#undef _fmaxf

#if defined(__OPENMP_NVPTX__) || defined(__OPENMP_AMDGCN__)
#pragma omp end declare target
#endif

#pragma pop_macro("__DEVICE__")

#endif // __CLANG_CUDA_COMPLEX_BUILTINS
File diff suppressed because it is too large
@@ -1,994 +0,0 @@
/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __CLANG_CUDA_INTRINSICS_H__
#define __CLANG_CUDA_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

// sm_30 intrinsics: __shfl_{up,down,xor}.

#define __SM_30_INTRINSICS_H__
#define __SM_30_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

#pragma push_macro("__MAKE_SHUFFLES")
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \
                        __Type) \
  inline __device__ int __FnName(int __val, __Type __offset, \
                                 int __width = warpSize) { \
    return __IntIntrinsic(__val, __offset, \
                          ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ float __FnName(float __val, __Type __offset, \
                                   int __width = warpSize) { \
    return __FloatIntrinsic(__val, __offset, \
                            ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
                                          int __width = warpSize) { \
    return static_cast<unsigned int>( \
        ::__FnName(static_cast<int>(__val), __offset, __width)); \
  } \
  inline __device__ long long __FnName(long long __val, __Type __offset, \
                                       int __width = warpSize) { \
    struct __Bits { \
      int __a, __b; \
    }; \
    _Static_assert(sizeof(__val) == sizeof(__Bits)); \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
    __Bits __tmp; \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \
    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \
    long long __ret; \
    memcpy(&__ret, &__tmp, sizeof(__tmp)); \
    return __ret; \
  } \
  inline __device__ long __FnName(long __val, __Type __offset, \
                                  int __width = warpSize) { \
    _Static_assert(sizeof(long) == sizeof(long long) || \
                   sizeof(long) == sizeof(int)); \
    if (sizeof(long) == sizeof(long long)) { \
      return static_cast<long>( \
          ::__FnName(static_cast<long long>(__val), __offset, __width)); \
    } else if (sizeof(long) == sizeof(int)) { \
      return static_cast<long>( \
          ::__FnName(static_cast<int>(__val), __offset, __width)); \
    } \
  } \
  inline __device__ unsigned long __FnName( \
      unsigned long __val, __Type __offset, int __width = warpSize) { \
    return static_cast<unsigned long>( \
        ::__FnName(static_cast<long>(__val), __offset, __width)); \
  } \
  inline __device__ unsigned long long __FnName( \
      unsigned long long __val, __Type __offset, int __width = warpSize) { \
    return static_cast<unsigned long long>( \
        ::__FnName(static_cast<long long>(__val), __offset, __width)); \
  } \
  inline __device__ double __FnName(double __val, __Type __offset, \
                                    int __width = warpSize) { \
    long long __tmp; \
    _Static_assert(sizeof(__tmp) == sizeof(__val)); \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp = ::__FnName(__tmp, __offset, __width); \
    double __ret; \
    memcpy(&__ret, &__tmp, sizeof(__ret)); \
    return __ret; \
  }

__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
                unsigned int);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
                unsigned int);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
                int);
#pragma pop_macro("__MAKE_SHUFFLES")

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

#if CUDA_VERSION >= 9000
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
// __shfl_sync_* variants available in CUDA-9
#pragma push_macro("__MAKE_SYNC_SHUFFLES")
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
                             __Mask, __Type) \
  inline __device__ int __FnName(unsigned int __mask, int __val, \
                                 __Type __offset, int __width = warpSize) { \
    return __IntIntrinsic(__mask, __val, __offset, \
                          ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ float __FnName(unsigned int __mask, float __val, \
                                   __Type __offset, int __width = warpSize) { \
    return __FloatIntrinsic(__mask, __val, __offset, \
                            ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ unsigned int __FnName(unsigned int __mask, \
                                          unsigned int __val, __Type __offset, \
                                          int __width = warpSize) { \
    return static_cast<unsigned int>( \
        ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
  } \
  inline __device__ long long __FnName(unsigned int __mask, long long __val, \
                                       __Type __offset, \
                                       int __width = warpSize) { \
    struct __Bits { \
      int __a, __b; \
    }; \
    _Static_assert(sizeof(__val) == sizeof(__Bits)); \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
    __Bits __tmp; \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \
    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \
    long long __ret; \
    memcpy(&__ret, &__tmp, sizeof(__tmp)); \
    return __ret; \
  } \
  inline __device__ unsigned long long __FnName( \
      unsigned int __mask, unsigned long long __val, __Type __offset, \
      int __width = warpSize) { \
    return static_cast<unsigned long long>( \
        ::__FnName(__mask, static_cast<long long>(__val), __offset, __width)); \
  } \
  inline __device__ long __FnName(unsigned int __mask, long __val, \
                                  __Type __offset, int __width = warpSize) { \
    _Static_assert(sizeof(long) == sizeof(long long) || \
                   sizeof(long) == sizeof(int)); \
    if (sizeof(long) == sizeof(long long)) { \
      return static_cast<long>(::__FnName( \
          __mask, static_cast<long long>(__val), __offset, __width)); \
    } else if (sizeof(long) == sizeof(int)) { \
      return static_cast<long>( \
          ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
    } \
  } \
  inline __device__ unsigned long __FnName( \
      unsigned int __mask, unsigned long __val, __Type __offset, \
      int __width = warpSize) { \
    return static_cast<unsigned long>( \
        ::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \
  } \
  inline __device__ double __FnName(unsigned int __mask, double __val, \
                                    __Type __offset, int __width = warpSize) { \
    long long __tmp; \
    _Static_assert(sizeof(__tmp) == sizeof(__val)); \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp = ::__FnName(__mask, __tmp, __offset, __width); \
    double __ret; \
    memcpy(&__ret, &__tmp, sizeof(__ret)); \
    return __ret; \
  }
__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
                     __nvvm_shfl_sync_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
                     __nvvm_shfl_sync_up_f32, 0, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
                     __nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
                     __nvvm_shfl_sync_bfly_f32, 0x1f, int);
#pragma pop_macro("__MAKE_SYNC_SHUFFLES")

inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
  return __nvvm_bar_warp_sync(mask);
}

inline __device__ void __barrier_sync(unsigned int id) {
  __nvvm_barrier_sync(id);
}

inline __device__ void __barrier_sync_count(unsigned int id,
                                            unsigned int count) {
  __nvvm_barrier_sync_cnt(id, count);
}

inline __device__ int __all_sync(unsigned int mask, int pred) {
  return __nvvm_vote_all_sync(mask, pred);
}

inline __device__ int __any_sync(unsigned int mask, int pred) {
  return __nvvm_vote_any_sync(mask, pred);
}

inline __device__ int __uni_sync(unsigned int mask, int pred) {
  return __nvvm_vote_uni_sync(mask, pred);
}

inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
  return __nvvm_vote_ballot_sync(mask, pred);
}
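// Usage sketch for the vote intrinsics: __ballot_sync returns one bit per
// participating lane, so a warp can, for instance, count how many lanes
// satisfy a predicate with __popc(__ballot_sync(0xffffffff, pred)).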

inline __device__ unsigned int __activemask() {
#if CUDA_VERSION < 9020
  return __nvvm_vote_ballot(1);
#else
  return __nvvm_activemask();
#endif
}

inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
  return __nvvm_fns(mask, base, offset);
}

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

// Define __match* builtins CUDA-9 headers expect to see.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
inline __device__ unsigned int __match32_any_sync(unsigned int mask,
                                                  unsigned int value) {
  return __nvvm_match_any_sync_i32(mask, value);
}

inline __device__ unsigned int
__match64_any_sync(unsigned int mask, unsigned long long value) {
  return __nvvm_match_any_sync_i64(mask, value);
}

inline __device__ unsigned int
__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
  return __nvvm_match_all_sync_i32p(mask, value, pred);
}

inline __device__ unsigned int
__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
  return __nvvm_match_all_sync_i64p(mask, value, pred);
}
#include "crt/sm_70_rt.hpp"

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
#endif // CUDA_VERSION >= 9000

// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.

// Prevent the vanilla sm_32 intrinsics header from being included.
#define __SM_32_INTRINSICS_H__
#define __SM_32_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
inline __device__ long long __ldg(const long long *ptr) {
  return __nvvm_ldg_ll(ptr);
}
inline __device__ unsigned char __ldg(const unsigned char *ptr) {
  return __nvvm_ldg_uc(ptr);
}
inline __device__ signed char __ldg(const signed char *ptr) {
  return __nvvm_ldg_uc((const unsigned char *)ptr);
}
inline __device__ unsigned short __ldg(const unsigned short *ptr) {
  return __nvvm_ldg_us(ptr);
}
inline __device__ unsigned int __ldg(const unsigned int *ptr) {
  return __nvvm_ldg_ui(ptr);
}
inline __device__ unsigned long __ldg(const unsigned long *ptr) {
  return __nvvm_ldg_ul(ptr);
}
inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
  return __nvvm_ldg_ull(ptr);
}
inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }

inline __device__ char2 __ldg(const char2 *ptr) {
  typedef char c2 __attribute__((ext_vector_type(2)));
  // We can assume that ptr is aligned at least to char2's alignment, but the
  // load will assume that ptr is aligned to c2's alignment. This is only
  // safe if alignof(c2) <= alignof(char2).
  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
  char2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ char4 __ldg(const char4 *ptr) {
  typedef char c4 __attribute__((ext_vector_type(4)));
  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
  char4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ short2 __ldg(const short2 *ptr) {
  typedef short s2 __attribute__((ext_vector_type(2)));
  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
  short2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ short4 __ldg(const short4 *ptr) {
  typedef short s4 __attribute__((ext_vector_type(4)));
  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
  short4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ int2 __ldg(const int2 *ptr) {
  typedef int i2 __attribute__((ext_vector_type(2)));
  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
  int2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ int4 __ldg(const int4 *ptr) {
  typedef int i4 __attribute__((ext_vector_type(4)));
  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
  int4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
  typedef long long ll2 __attribute__((ext_vector_type(2)));
  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
  longlong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ uchar2 __ldg(const uchar2 *ptr) {
  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
  uchar2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
  uchar4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
  ushort2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
  ushort4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ uint2 __ldg(const uint2 *ptr) {
  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
  uint2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uint4 __ldg(const uint4 *ptr) {
  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
  uint4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
  ulonglong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ float2 __ldg(const float2 *ptr) {
  typedef float f2 __attribute__((ext_vector_type(2)));
  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
  float2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ float4 __ldg(const float4 *ptr) {
  typedef float f4 __attribute__((ext_vector_type(4)));
  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
  float4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ double2 __ldg(const double2 *ptr) {
  typedef double d2 __attribute__((ext_vector_type(2)));
  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
  double2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

// TODO: Implement these as intrinsics, so the backend can work its magic on
// these. Alternatively, we could implement these as plain C and try to get
// llvm to recognize the relevant patterns.
inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned ret;
  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
      : "=r"(ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return ret;
}
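// Semantics sketch for the funnel shifts above: each returns 32 bits of the
// 64-bit value (high32:low32) shifted by shiftWidth. The .l forms shift left
// and return the upper word; the .r forms shift right and return the lower
// word. .wrap uses shiftWidth % 32, while .clamp limits it to at most 32.
// For example, __funnelshift_r(lo, hi, 8) yields (hi << 24) | (lo >> 8).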

#if defined(__cplusplus) && (__cplusplus >= 201103L)

#pragma push_macro("__INTRINSIC_LOAD")
#define __INTRINSIC_LOAD(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
                         __Clobber) \
  inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \
    __TmpType __ret; \
    asm(__AsmOp " %0, [%1];" : __AsmType(__ret) : "l"(__ptr)__Clobber); \
    return (__DeclType)__ret; \
  }

#pragma push_macro("__INTRINSIC_LOAD2")
#define __INTRINSIC_LOAD2(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
                          __Clobber) \
  inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \
    __DeclType __ret; \
    __TmpType __tmp; \
    asm(__AsmOp " {%0,%1}, [%2];" \
        : __AsmType(__tmp.x), __AsmType(__tmp.y) \
        : "l"(__ptr)__Clobber); \
    using __ElementType = decltype(__ret.x); \
    __ret.x = (__ElementType)(__tmp.x); \
    __ret.y = (__ElementType)__tmp.y; \
    return __ret; \
  }

#pragma push_macro("__INTRINSIC_LOAD4")
#define __INTRINSIC_LOAD4(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
                          __Clobber) \
  inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \
    __DeclType __ret; \
    __TmpType __tmp; \
    asm(__AsmOp " {%0,%1,%2,%3}, [%4];" \
        : __AsmType(__tmp.x), __AsmType(__tmp.y), __AsmType(__tmp.z), \
          __AsmType(__tmp.w) \
        : "l"(__ptr)__Clobber); \
    using __ElementType = decltype(__ret.x); \
    __ret.x = (__ElementType)__tmp.x; \
    __ret.y = (__ElementType)__tmp.y; \
    __ret.z = (__ElementType)__tmp.z; \
    __ret.w = (__ElementType)__tmp.w; \
    return __ret; \
  }

__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", char, unsigned int, "=r", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", signed char, unsigned int, "=r", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s16", short, unsigned short, "=h", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s32", int, unsigned int, "=r", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s64", long long, unsigned long long,
                 "=l", );

__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s8", char2, int2, "=r", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s8", char4, int4, "=r", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s16", short2, short2, "=h", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s16", short4, short4, "=h", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s32", int2, int2, "=r", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s32", int4, int4, "=r", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s64 ", longlong2, longlong2, "=l", );

__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u8", unsigned char, unsigned int,
                 "=r", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u16", unsigned short, unsigned short,
                 "=h", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u32", unsigned int, unsigned int,
                 "=r", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u64", unsigned long long,
                 unsigned long long, "=l", );

__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u8", uchar2, int2, "=r", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u8", uchar4, int4, "=r", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u16", ushort2, ushort2, "=h", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u16", ushort4, ushort4, "=h", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u32", uint2, uint2, "=r", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u32", uint4, uint4, "=r", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u64", ulonglong2, ulonglong2,
                  "=l", );

__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f32", float, float, "=f", );
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f64", double, double, "=d", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f32", float2, float2, "=f", );
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.f32", float4, float4, "=f", );
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f64", double2, double2, "=d", );

inline __device__ long __ldcg(const long *__ptr) {
  unsigned long __ret;
  if (sizeof(long) == 8) {
    asm("ld.global.cg.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
  } else {
    asm("ld.global.cg.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
  }
  return (long)__ret;
}

__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u8", unsigned char, unsigned int,
                 "=r", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u16", unsigned short, unsigned short,
                 "=h", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u32", unsigned int, unsigned int,
                 "=r", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u64", unsigned long long,
                 unsigned long long, "=l", : "memory");

__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", char, unsigned int,
                 "=r", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", signed char, unsigned int,
                 "=r", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s16", short, unsigned short,
                 "=h", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s32", int, unsigned int,
                 "=r", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s64", long long, unsigned long long,
                 "=l", : "memory");

__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u8", uchar2, uint2,
                  "=r", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u8", uchar4, uint4,
                  "=r", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u16", ushort2, ushort2,
                  "=h", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u16", ushort4, ushort4,
                  "=h", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u32", uint2, uint2,
                  "=r", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u32", uint4, uint4,
                  "=r", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u64", ulonglong2, ulonglong2,
                  "=l", : "memory");

__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s8", char2, int2, "=r", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s8", char4, int4, "=r", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s16", short2, short2,
                  "=h", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s16", short4, short4,
                  "=h", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s32", int2, int2, "=r", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s32", int4, int4, "=r", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s64", longlong2, longlong2,
                  "=l", : "memory");

__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f32", float, float, "=f", : "memory");
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f64", double, double, "=d", : "memory");

__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f32", float2, float2,
                  "=f", : "memory");
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.f32", float4, float4,
                  "=f", : "memory");
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f64", double2, double2,
                  "=d", : "memory");

inline __device__ long __ldcv(const long *__ptr) {
  unsigned long __ret;
  if (sizeof(long) == 8) {
    asm("ld.global.cv.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
  } else {
    asm("ld.global.cv.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
  }
  return (long)__ret;
}

__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", char, unsigned int, "=r", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", signed char, signed int, "=r", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s16", short, unsigned short, "=h", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s32", int, unsigned int, "=r", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s64", long long, unsigned long long,
                 "=l", );

__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s8", char2, int2, "=r", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s8", char4, int4, "=r", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s16", short2, short2, "=h", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s16", short4, short4, "=h", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s32", int2, int2, "=r", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s32", int4, int4, "=r", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s64", longlong2, longlong2, "=l", );

__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u8", unsigned char, unsigned int,
                 "=r", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u16", unsigned short, unsigned short,
                 "=h", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u32", unsigned int, unsigned int,
                 "=r", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u64", unsigned long long,
                 unsigned long long, "=l", );

__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u8", uchar2, uint2, "=r", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u8", uchar4, uint4, "=r", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u16", ushort2, ushort2, "=h", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u16", ushort4, ushort4, "=h", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u32", uint2, uint2, "=r", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u32", uint4, uint4, "=r", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u64", ulonglong2, ulonglong2,
                  "=l", );

__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f32", float, float, "=f", );
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f64", double, double, "=d", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f32", float2, float2, "=f", );
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.f32", float4, float4, "=f", );
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f64", double2, double2, "=d", );

#pragma pop_macro("__INTRINSIC_LOAD")
#pragma pop_macro("__INTRINSIC_LOAD2")
#pragma pop_macro("__INTRINSIC_LOAD4")

inline __device__ long __ldcs(const long *__ptr) {
  unsigned long __ret;
  if (sizeof(long) == 8) {
    asm("ld.global.cs.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
  } else {
    asm("ld.global.cs.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
  }
  return (long)__ret;
}

#pragma push_macro("__INTRINSIC_STORE")
#define __INTRINSIC_STORE(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType) \
  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \
    __TmpType __tmp = (__TmpType)__value; \
    asm(__AsmOp " [%0], %1;" ::"l"(__ptr), __AsmType(__tmp) : "memory"); \
  }

#pragma push_macro("__INTRINSIC_STORE2")
#define __INTRINSIC_STORE2(__FnName, __AsmOp, __DeclType, __TmpType, \
                           __AsmType) \
  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \
    __TmpType __tmp; \
    using __ElementType = decltype(__tmp.x); \
    __tmp.x = (__ElementType)(__value.x); \
    __tmp.y = (__ElementType)(__value.y); \
    asm(__AsmOp " [%0], {%1,%2};" ::"l"(__ptr), __AsmType(__tmp.x), \
        __AsmType(__tmp.y) \
        : "memory"); \
  }

#pragma push_macro("__INTRINSIC_STORE4")
#define __INTRINSIC_STORE4(__FnName, __AsmOp, __DeclType, __TmpType, \
                           __AsmType) \
  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \
    __TmpType __tmp; \
    using __ElementType = decltype(__tmp.x); \
    __tmp.x = (__ElementType)(__value.x); \
    __tmp.y = (__ElementType)(__value.y); \
    __tmp.z = (__ElementType)(__value.z); \
    __tmp.w = (__ElementType)(__value.w); \
    asm(__AsmOp " [%0], {%1,%2,%3,%4};" ::"l"(__ptr), __AsmType(__tmp.x), \
        __AsmType(__tmp.y), __AsmType(__tmp.z), __AsmType(__tmp.w) \
        : "memory"); \
  }

__INTRINSIC_STORE(__stwt, "st.global.wt.s8", char, int, "r");
__INTRINSIC_STORE(__stwt, "st.global.wt.s8", signed char, int, "r");
__INTRINSIC_STORE(__stwt, "st.global.wt.s16", short, short, "h");
__INTRINSIC_STORE(__stwt, "st.global.wt.s32", int, int, "r");
__INTRINSIC_STORE(__stwt, "st.global.wt.s64", long long, long long, "l");

__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s8", char2, int2, "r");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s8", char4, int4, "r");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s16", short2, short2, "h");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s16", short4, short4, "h");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s32", int2, int2, "r");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s32", int4, int4, "r");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s64", longlong2, longlong2, "l");

__INTRINSIC_STORE(__stwt, "st.global.wt.u8", unsigned char, int, "r");
__INTRINSIC_STORE(__stwt, "st.global.wt.u16", unsigned short, unsigned short,
                  "h");
__INTRINSIC_STORE(__stwt, "st.global.wt.u32", unsigned int, unsigned int, "r");
__INTRINSIC_STORE(__stwt, "st.global.wt.u64", unsigned long long,
                  unsigned long long, "l");

__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u8", uchar2, uchar2, "r");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u8", uchar4, uint4, "r");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u16", ushort2, ushort2, "h");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u16", ushort4, ushort4, "h");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u32", uint2, uint2, "r");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u32", uint4, uint4, "r");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u64", ulonglong2, ulonglong2, "l");

__INTRINSIC_STORE(__stwt, "st.global.wt.f32", float, float, "f");
__INTRINSIC_STORE(__stwt, "st.global.wt.f64", double, double, "d");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f32", float2, float2, "f");
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.f32", float4, float4, "f");
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f64", double2, double2, "d");

#pragma pop_macro("__INTRINSIC_STORE")
#pragma pop_macro("__INTRINSIC_STORE2")
#pragma pop_macro("__INTRINSIC_STORE4")

#endif // defined(__cplusplus) && (__cplusplus >= 201103L)
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

#if CUDA_VERSION >= 11000
extern "C" {
__device__ inline size_t __nv_cvta_generic_to_global_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_shared_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_constant_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_local_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(5))) *)__ptr;
}
__device__ inline void *__nv_cvta_global_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline void *__nv_cvta_shared_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline void *__nv_cvta_constant_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline void *__nv_cvta_local_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(5))) *)__ptr;
}
__device__ inline cuuint32_t __nvvm_get_smem_pointer(void *__ptr) {
  return __nv_cvta_generic_to_shared_impl(__ptr);
}
} // extern "C"
|
|
||||||
|
|
||||||
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
|
|
||||||
__device__ inline unsigned __reduce_add_sync(unsigned __mask,
|
|
||||||
unsigned __value) {
|
|
||||||
return __nvvm_redux_sync_add(__value, __mask);
|
|
||||||
}
|
|
||||||
__device__ inline unsigned __reduce_min_sync(unsigned __mask,
|
|
||||||
unsigned __value) {
|
|
||||||
return __nvvm_redux_sync_umin(__value, __mask);
|
|
||||||
}
|
|
||||||
__device__ inline unsigned __reduce_max_sync(unsigned __mask,
|
|
||||||
unsigned __value) {
|
|
||||||
return __nvvm_redux_sync_umax(__value, __mask);
|
|
||||||
}
|
|
||||||
__device__ inline int __reduce_min_sync(unsigned __mask, int __value) {
|
|
||||||
return __nvvm_redux_sync_min(__value, __mask);
|
|
||||||
}
|
|
||||||
__device__ inline int __reduce_max_sync(unsigned __mask, int __value) {
|
|
||||||
return __nvvm_redux_sync_max(__value, __mask);
|
|
||||||
}
|
|
||||||
__device__ inline unsigned __reduce_or_sync(unsigned __mask, unsigned __value) {
|
|
||||||
return __nvvm_redux_sync_or(__value, __mask);
|
|
||||||
}
|
|
||||||
__device__ inline unsigned __reduce_and_sync(unsigned __mask,
|
|
||||||
unsigned __value) {
|
|
||||||
return __nvvm_redux_sync_and(__value, __mask);
|
|
||||||
}
|
|
||||||
__device__ inline unsigned __reduce_xor_sync(unsigned __mask,
|
|
||||||
unsigned __value) {
|
|
||||||
return __nvvm_redux_sync_xor(__value, __mask);
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ inline void __nv_memcpy_async_shared_global_4(void *__dst,
|
|
||||||
const void *__src,
|
|
||||||
unsigned __src_size) {
|
|
||||||
__nvvm_cp_async_ca_shared_global_4(
|
|
||||||
(void __attribute__((address_space(3))) *)__dst,
|
|
||||||
(const void __attribute__((address_space(1))) *)__src, __src_size);
|
|
||||||
}
|
|
||||||
__device__ inline void __nv_memcpy_async_shared_global_8(void *__dst,
|
|
||||||
const void *__src,
|
|
||||||
unsigned __src_size) {
|
|
||||||
__nvvm_cp_async_ca_shared_global_8(
|
|
||||||
(void __attribute__((address_space(3))) *)__dst,
|
|
||||||
(const void __attribute__((address_space(1))) *)__src, __src_size);
|
|
||||||
}
|
|
||||||
__device__ inline void __nv_memcpy_async_shared_global_16(void *__dst,
|
|
||||||
const void *__src,
|
|
||||||
unsigned __src_size) {
|
|
||||||
__nvvm_cp_async_ca_shared_global_16(
|
|
||||||
(void __attribute__((address_space(3))) *)__dst,
|
|
||||||
(const void __attribute__((address_space(1))) *)__src, __src_size);
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ inline void *
|
|
||||||
__nv_associate_access_property(const void *__ptr, unsigned long long __prop) {
|
|
||||||
// TODO: it appears to provide compiler with some sort of a hint. We do not
|
|
||||||
// know what exactly it is supposed to do. However, CUDA headers suggest that
|
|
||||||
// just passing through __ptr should not affect correctness. They do so on
|
|
||||||
// pre-sm80 GPUs where this builtin is not available.
|
|
||||||
return (void*)__ptr;
|
|
||||||
}
|
|
||||||
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
|
|
||||||
|
|
||||||
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
|
|
||||||
__device__ inline unsigned __isCtaShared(const void *ptr) {
|
|
||||||
return __isShared(ptr);
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ inline unsigned __isClusterShared(const void *__ptr) {
|
|
||||||
return __nvvm_isspacep_shared_cluster(__ptr);
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ inline void *__cluster_map_shared_rank(const void *__ptr,
|
|
||||||
unsigned __rank) {
|
|
||||||
return __nvvm_mapa((void *)__ptr, __rank);
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ inline unsigned __cluster_query_shared_rank(const void *__ptr) {
|
|
||||||
return __nvvm_getctarank((void *)__ptr);
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ inline uint2
|
|
||||||
__cluster_map_shared_multicast(const void *__ptr,
|
|
||||||
unsigned int __cluster_cta_mask) {
|
|
||||||
return make_uint2((unsigned)__cvta_generic_to_shared(__ptr),
|
|
||||||
__cluster_cta_mask);
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ inline unsigned __clusterDimIsSpecified() {
|
|
||||||
return __nvvm_is_explicit_cluster();
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ inline dim3 __clusterDim() {
|
|
||||||
return dim3(__nvvm_read_ptx_sreg_cluster_nctaid_x(),
|
|
||||||
__nvvm_read_ptx_sreg_cluster_nctaid_y(),
|
|
||||||
__nvvm_read_ptx_sreg_cluster_nctaid_z());
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ inline dim3 __clusterRelativeBlockIdx() {
|
|
||||||
return dim3(__nvvm_read_ptx_sreg_cluster_ctaid_x(),
|
|
||||||
__nvvm_read_ptx_sreg_cluster_ctaid_y(),
|
|
||||||
__nvvm_read_ptx_sreg_cluster_ctaid_z());
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ inline dim3 __clusterGridDimInClusters() {
|
|
||||||
return dim3(__nvvm_read_ptx_sreg_nclusterid_x(),
|
|
||||||
__nvvm_read_ptx_sreg_nclusterid_y(),
|
|
||||||
__nvvm_read_ptx_sreg_nclusterid_z());
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ inline dim3 __clusterIdx() {
|
|
||||||
return dim3(__nvvm_read_ptx_sreg_clusterid_x(),
|
|
||||||
__nvvm_read_ptx_sreg_clusterid_y(),
|
|
||||||
__nvvm_read_ptx_sreg_clusterid_z());
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ inline unsigned __clusterRelativeBlockRank() {
|
|
||||||
return __nvvm_read_ptx_sreg_cluster_ctarank();
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ inline unsigned __clusterSizeInBlocks() {
|
|
||||||
return __nvvm_read_ptx_sreg_cluster_nctarank();
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ inline void __cluster_barrier_arrive() {
|
|
||||||
__nvvm_barrier_cluster_arrive();
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ inline void __cluster_barrier_arrive_relaxed() {
|
|
||||||
__nvvm_barrier_cluster_arrive_relaxed();
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ inline void __cluster_barrier_wait() {
|
|
||||||
__nvvm_barrier_cluster_wait();
|
|
||||||
}

__device__ inline void __threadfence_cluster() { __nvvm_fence_sc_cluster(); }

__device__ inline float2 atomicAdd(float2 *__ptr, float2 __val) {
  float2 __ret;
  __asm__("atom.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
          : "=f"(__ret.x), "=f"(__ret.y)
          : "l"(__ptr), "f"(__val.x), "f"(__val.y));
  return __ret;
}

__device__ inline float2 atomicAdd_block(float2 *__ptr, float2 __val) {
  float2 __ret;
  __asm__("atom.cta.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
          : "=f"(__ret.x), "=f"(__ret.y)
          : "l"(__ptr), "f"(__val.x), "f"(__val.y));
  return __ret;
}

__device__ inline float2 atomicAdd_system(float2 *__ptr, float2 __val) {
  float2 __ret;
  __asm__("atom.sys.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
          : "=f"(__ret.x), "=f"(__ret.y)
          : "l"(__ptr), "f"(__val.x), "f"(__val.y));
  return __ret;
}

__device__ inline float4 atomicAdd(float4 *__ptr, float4 __val) {
  float4 __ret;
  __asm__("atom.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
          : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
          : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w));
  return __ret;
}

__device__ inline float4 atomicAdd_block(float4 *__ptr, float4 __val) {
  float4 __ret;
  __asm__(
      "atom.cta.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
      : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
      : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w));
  return __ret;
}

__device__ inline float4 atomicAdd_system(float4 *__ptr, float4 __val) {
  float4 __ret;
  __asm__(
      "atom.sys.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
      : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
      : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w)
      :);
  return __ret;
}
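// Example use of the vector overloads above (illustrative; requires sm_90+,
// per the architecture guard that follows):
//   float4 __old = atomicAdd(&__out[__i], make_float4(1.f, 2.f, 3.f, 4.f));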

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
#endif // CUDA_VERSION >= 11000

#endif // defined(__CLANG_CUDA_INTRINSICS_H__)
@@ -1,468 +0,0 @@
/*===-- __clang_cuda_libdevice_declares.h - decls for libdevice functions --===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __CLANG_CUDA_LIBDEVICE_DECLARES_H__
#define __CLANG_CUDA_LIBDEVICE_DECLARES_H__

#if defined(__cplusplus)
extern "C" {
#endif

#if defined(__OPENMP_NVPTX__)
#define __DEVICE__
#pragma omp begin assumes ext_spmd_amenable no_openmp
#elif defined(__CUDA__)
#define __DEVICE__ __device__
#endif
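// The declarations below let the wrapper headers call functions such as
// __nv_sinf(); the definitions come from NVIDIA's libdevice bitcode, which
// clang links in during device compilation. A minimal illustration:
//   __device__ float __f(float __x) { return __nv_sinf(__x); }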

__DEVICE__ int __nv_abs(int __a);
__DEVICE__ double __nv_acos(double __a);
__DEVICE__ float __nv_acosf(float __a);
__DEVICE__ double __nv_acosh(double __a);
__DEVICE__ float __nv_acoshf(float __a);
__DEVICE__ double __nv_asin(double __a);
__DEVICE__ float __nv_asinf(float __a);
__DEVICE__ double __nv_asinh(double __a);
__DEVICE__ float __nv_asinhf(float __a);
__DEVICE__ double __nv_atan2(double __a, double __b);
__DEVICE__ float __nv_atan2f(float __a, float __b);
__DEVICE__ double __nv_atan(double __a);
__DEVICE__ float __nv_atanf(float __a);
__DEVICE__ double __nv_atanh(double __a);
__DEVICE__ float __nv_atanhf(float __a);
__DEVICE__ int __nv_brev(int __a);
__DEVICE__ long long __nv_brevll(long long __a);
__DEVICE__ int __nv_byte_perm(int __a, int __b, int __c);
__DEVICE__ double __nv_cbrt(double __a);
__DEVICE__ float __nv_cbrtf(float __a);
__DEVICE__ double __nv_ceil(double __a);
__DEVICE__ float __nv_ceilf(float __a);
__DEVICE__ int __nv_clz(int __a);
__DEVICE__ int __nv_clzll(long long __a);
__DEVICE__ double __nv_copysign(double __a, double __b);
__DEVICE__ float __nv_copysignf(float __a, float __b);
__DEVICE__ double __nv_cos(double __a);
__DEVICE__ float __nv_cosf(float __a);
__DEVICE__ double __nv_cosh(double __a);
__DEVICE__ float __nv_coshf(float __a);
__DEVICE__ double __nv_cospi(double __a);
__DEVICE__ float __nv_cospif(float __a);
__DEVICE__ double __nv_cyl_bessel_i0(double __a);
__DEVICE__ float __nv_cyl_bessel_i0f(float __a);
__DEVICE__ double __nv_cyl_bessel_i1(double __a);
__DEVICE__ float __nv_cyl_bessel_i1f(float __a);
__DEVICE__ double __nv_dadd_rd(double __a, double __b);
__DEVICE__ double __nv_dadd_rn(double __a, double __b);
__DEVICE__ double __nv_dadd_ru(double __a, double __b);
__DEVICE__ double __nv_dadd_rz(double __a, double __b);
__DEVICE__ double __nv_ddiv_rd(double __a, double __b);
__DEVICE__ double __nv_ddiv_rn(double __a, double __b);
__DEVICE__ double __nv_ddiv_ru(double __a, double __b);
__DEVICE__ double __nv_ddiv_rz(double __a, double __b);
__DEVICE__ double __nv_dmul_rd(double __a, double __b);
__DEVICE__ double __nv_dmul_rn(double __a, double __b);
__DEVICE__ double __nv_dmul_ru(double __a, double __b);
__DEVICE__ double __nv_dmul_rz(double __a, double __b);
__DEVICE__ float __nv_double2float_rd(double __a);
__DEVICE__ float __nv_double2float_rn(double __a);
__DEVICE__ float __nv_double2float_ru(double __a);
__DEVICE__ float __nv_double2float_rz(double __a);
__DEVICE__ int __nv_double2hiint(double __a);
__DEVICE__ int __nv_double2int_rd(double __a);
__DEVICE__ int __nv_double2int_rn(double __a);
__DEVICE__ int __nv_double2int_ru(double __a);
__DEVICE__ int __nv_double2int_rz(double __a);
__DEVICE__ long long __nv_double2ll_rd(double __a);
__DEVICE__ long long __nv_double2ll_rn(double __a);
__DEVICE__ long long __nv_double2ll_ru(double __a);
__DEVICE__ long long __nv_double2ll_rz(double __a);
__DEVICE__ int __nv_double2loint(double __a);
__DEVICE__ unsigned int __nv_double2uint_rd(double __a);
__DEVICE__ unsigned int __nv_double2uint_rn(double __a);
__DEVICE__ unsigned int __nv_double2uint_ru(double __a);
__DEVICE__ unsigned int __nv_double2uint_rz(double __a);
__DEVICE__ unsigned long long __nv_double2ull_rd(double __a);
__DEVICE__ unsigned long long __nv_double2ull_rn(double __a);
__DEVICE__ unsigned long long __nv_double2ull_ru(double __a);
__DEVICE__ unsigned long long __nv_double2ull_rz(double __a);
__DEVICE__ unsigned long long __nv_double_as_longlong(double __a);
__DEVICE__ double __nv_drcp_rd(double __a);
__DEVICE__ double __nv_drcp_rn(double __a);
__DEVICE__ double __nv_drcp_ru(double __a);
__DEVICE__ double __nv_drcp_rz(double __a);
__DEVICE__ double __nv_dsqrt_rd(double __a);
__DEVICE__ double __nv_dsqrt_rn(double __a);
__DEVICE__ double __nv_dsqrt_ru(double __a);
__DEVICE__ double __nv_dsqrt_rz(double __a);
__DEVICE__ double __nv_dsub_rd(double __a, double __b);
__DEVICE__ double __nv_dsub_rn(double __a, double __b);
__DEVICE__ double __nv_dsub_ru(double __a, double __b);
__DEVICE__ double __nv_dsub_rz(double __a, double __b);
__DEVICE__ double __nv_erfc(double __a);
__DEVICE__ float __nv_erfcf(float __a);
__DEVICE__ double __nv_erfcinv(double __a);
__DEVICE__ float __nv_erfcinvf(float __a);
__DEVICE__ double __nv_erfcx(double __a);
__DEVICE__ float __nv_erfcxf(float __a);
__DEVICE__ double __nv_erf(double __a);
__DEVICE__ float __nv_erff(float __a);
__DEVICE__ double __nv_erfinv(double __a);
__DEVICE__ float __nv_erfinvf(float __a);
__DEVICE__ double __nv_exp10(double __a);
__DEVICE__ float __nv_exp10f(float __a);
__DEVICE__ double __nv_exp2(double __a);
__DEVICE__ float __nv_exp2f(float __a);
__DEVICE__ double __nv_exp(double __a);
__DEVICE__ float __nv_expf(float __a);
__DEVICE__ double __nv_expm1(double __a);
__DEVICE__ float __nv_expm1f(float __a);
__DEVICE__ double __nv_fabs(double __a);
__DEVICE__ float __nv_fabsf(float __a);
__DEVICE__ float __nv_fadd_rd(float __a, float __b);
__DEVICE__ float __nv_fadd_rn(float __a, float __b);
__DEVICE__ float __nv_fadd_ru(float __a, float __b);
__DEVICE__ float __nv_fadd_rz(float __a, float __b);
__DEVICE__ float __nv_fast_cosf(float __a);
__DEVICE__ float __nv_fast_exp10f(float __a);
__DEVICE__ float __nv_fast_expf(float __a);
__DEVICE__ float __nv_fast_fdividef(float __a, float __b);
__DEVICE__ float __nv_fast_log10f(float __a);
__DEVICE__ float __nv_fast_log2f(float __a);
__DEVICE__ float __nv_fast_logf(float __a);
__DEVICE__ float __nv_fast_powf(float __a, float __b);
__DEVICE__ void __nv_fast_sincosf(float __a, float *__s, float *__c);
__DEVICE__ float __nv_fast_sinf(float __a);
__DEVICE__ float __nv_fast_tanf(float __a);
__DEVICE__ double __nv_fdim(double __a, double __b);
__DEVICE__ float __nv_fdimf(float __a, float __b);
__DEVICE__ float __nv_fdiv_rd(float __a, float __b);
__DEVICE__ float __nv_fdiv_rn(float __a, float __b);
__DEVICE__ float __nv_fdiv_ru(float __a, float __b);
__DEVICE__ float __nv_fdiv_rz(float __a, float __b);
__DEVICE__ int __nv_ffs(int __a);
__DEVICE__ int __nv_ffsll(long long __a);
__DEVICE__ int __nv_finitef(float __a);
__DEVICE__ unsigned short __nv_float2half_rn(float __a);
__DEVICE__ int __nv_float2int_rd(float __a);
__DEVICE__ int __nv_float2int_rn(float __a);
__DEVICE__ int __nv_float2int_ru(float __a);
__DEVICE__ int __nv_float2int_rz(float __a);
__DEVICE__ long long __nv_float2ll_rd(float __a);
__DEVICE__ long long __nv_float2ll_rn(float __a);
__DEVICE__ long long __nv_float2ll_ru(float __a);
__DEVICE__ long long __nv_float2ll_rz(float __a);
__DEVICE__ unsigned int __nv_float2uint_rd(float __a);
__DEVICE__ unsigned int __nv_float2uint_rn(float __a);
__DEVICE__ unsigned int __nv_float2uint_ru(float __a);
__DEVICE__ unsigned int __nv_float2uint_rz(float __a);
__DEVICE__ unsigned long long __nv_float2ull_rd(float __a);
__DEVICE__ unsigned long long __nv_float2ull_rn(float __a);
__DEVICE__ unsigned long long __nv_float2ull_ru(float __a);
__DEVICE__ unsigned long long __nv_float2ull_rz(float __a);
__DEVICE__ int __nv_float_as_int(float __a);
__DEVICE__ unsigned int __nv_float_as_uint(float __a);
__DEVICE__ double __nv_floor(double __a);
__DEVICE__ float __nv_floorf(float __a);
__DEVICE__ double __nv_fma(double __a, double __b, double __c);
__DEVICE__ float __nv_fmaf(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ieee_rd(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ieee_rn(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ieee_ru(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ieee_rz(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_rd(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_rn(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ru(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_rz(float __a, float __b, float __c);
__DEVICE__ double __nv_fma_rd(double __a, double __b, double __c);
__DEVICE__ double __nv_fma_rn(double __a, double __b, double __c);
__DEVICE__ double __nv_fma_ru(double __a, double __b, double __c);
__DEVICE__ double __nv_fma_rz(double __a, double __b, double __c);
__DEVICE__ double __nv_fmax(double __a, double __b);
__DEVICE__ float __nv_fmaxf(float __a, float __b);
__DEVICE__ double __nv_fmin(double __a, double __b);
__DEVICE__ float __nv_fminf(float __a, float __b);
__DEVICE__ double __nv_fmod(double __a, double __b);
__DEVICE__ float __nv_fmodf(float __a, float __b);
__DEVICE__ float __nv_fmul_rd(float __a, float __b);
__DEVICE__ float __nv_fmul_rn(float __a, float __b);
__DEVICE__ float __nv_fmul_ru(float __a, float __b);
__DEVICE__ float __nv_fmul_rz(float __a, float __b);
__DEVICE__ float __nv_frcp_rd(float __a);
__DEVICE__ float __nv_frcp_rn(float __a);
__DEVICE__ float __nv_frcp_ru(float __a);
__DEVICE__ float __nv_frcp_rz(float __a);
__DEVICE__ double __nv_frexp(double __a, int *__b);
__DEVICE__ float __nv_frexpf(float __a, int *__b);
__DEVICE__ float __nv_frsqrt_rn(float __a);
__DEVICE__ float __nv_fsqrt_rd(float __a);
__DEVICE__ float __nv_fsqrt_rn(float __a);
__DEVICE__ float __nv_fsqrt_ru(float __a);
__DEVICE__ float __nv_fsqrt_rz(float __a);
__DEVICE__ float __nv_fsub_rd(float __a, float __b);
__DEVICE__ float __nv_fsub_rn(float __a, float __b);
__DEVICE__ float __nv_fsub_ru(float __a, float __b);
__DEVICE__ float __nv_fsub_rz(float __a, float __b);
__DEVICE__ int __nv_hadd(int __a, int __b);
__DEVICE__ float __nv_half2float(unsigned short __h);
__DEVICE__ double __nv_hiloint2double(int __a, int __b);
__DEVICE__ double __nv_hypot(double __a, double __b);
__DEVICE__ float __nv_hypotf(float __a, float __b);
__DEVICE__ int __nv_ilogb(double __a);
__DEVICE__ int __nv_ilogbf(float __a);
__DEVICE__ double __nv_int2double_rn(int __a);
__DEVICE__ float __nv_int2float_rd(int __a);
__DEVICE__ float __nv_int2float_rn(int __a);
__DEVICE__ float __nv_int2float_ru(int __a);
__DEVICE__ float __nv_int2float_rz(int __a);
__DEVICE__ float __nv_int_as_float(int __a);
__DEVICE__ int __nv_isfinited(double __a);
__DEVICE__ int __nv_isinfd(double __a);
__DEVICE__ int __nv_isinff(float __a);
__DEVICE__ int __nv_isnand(double __a);
__DEVICE__ int __nv_isnanf(float __a);
__DEVICE__ double __nv_j0(double __a);
__DEVICE__ float __nv_j0f(float __a);
__DEVICE__ double __nv_j1(double __a);
__DEVICE__ float __nv_j1f(float __a);
__DEVICE__ float __nv_jnf(int __a, float __b);
__DEVICE__ double __nv_jn(int __a, double __b);
__DEVICE__ double __nv_ldexp(double __a, int __b);
__DEVICE__ float __nv_ldexpf(float __a, int __b);
__DEVICE__ double __nv_lgamma(double __a);
__DEVICE__ float __nv_lgammaf(float __a);
__DEVICE__ double __nv_ll2double_rd(long long __a);
__DEVICE__ double __nv_ll2double_rn(long long __a);
__DEVICE__ double __nv_ll2double_ru(long long __a);
__DEVICE__ double __nv_ll2double_rz(long long __a);
__DEVICE__ float __nv_ll2float_rd(long long __a);
__DEVICE__ float __nv_ll2float_rn(long long __a);
__DEVICE__ float __nv_ll2float_ru(long long __a);
__DEVICE__ float __nv_ll2float_rz(long long __a);
__DEVICE__ long long __nv_llabs(long long __a);
__DEVICE__ long long __nv_llmax(long long __a, long long __b);
__DEVICE__ long long __nv_llmin(long long __a, long long __b);
__DEVICE__ long long __nv_llrint(double __a);
__DEVICE__ long long __nv_llrintf(float __a);
__DEVICE__ long long __nv_llround(double __a);
__DEVICE__ long long __nv_llroundf(float __a);
__DEVICE__ double __nv_log10(double __a);
__DEVICE__ float __nv_log10f(float __a);
__DEVICE__ double __nv_log1p(double __a);
__DEVICE__ float __nv_log1pf(float __a);
__DEVICE__ double __nv_log2(double __a);
__DEVICE__ float __nv_log2f(float __a);
__DEVICE__ double __nv_logb(double __a);
__DEVICE__ float __nv_logbf(float __a);
__DEVICE__ double __nv_log(double __a);
__DEVICE__ float __nv_logf(float __a);
__DEVICE__ double __nv_longlong_as_double(long long __a);
__DEVICE__ int __nv_max(int __a, int __b);
__DEVICE__ int __nv_min(int __a, int __b);
__DEVICE__ double __nv_modf(double __a, double *__b);
__DEVICE__ float __nv_modff(float __a, float *__b);
__DEVICE__ int __nv_mul24(int __a, int __b);
__DEVICE__ long long __nv_mul64hi(long long __a, long long __b);
__DEVICE__ int __nv_mulhi(int __a, int __b);
__DEVICE__ double __nv_nan(const signed char *__a);
__DEVICE__ float __nv_nanf(const signed char *__a);
__DEVICE__ double __nv_nearbyint(double __a);
__DEVICE__ float __nv_nearbyintf(float __a);
__DEVICE__ double __nv_nextafter(double __a, double __b);
__DEVICE__ float __nv_nextafterf(float __a, float __b);
__DEVICE__ double __nv_norm3d(double __a, double __b, double __c);
__DEVICE__ float __nv_norm3df(float __a, float __b, float __c);
__DEVICE__ double __nv_norm4d(double __a, double __b, double __c, double __d);
__DEVICE__ float __nv_norm4df(float __a, float __b, float __c, float __d);
__DEVICE__ double __nv_normcdf(double __a);
__DEVICE__ float __nv_normcdff(float __a);
__DEVICE__ double __nv_normcdfinv(double __a);
__DEVICE__ float __nv_normcdfinvf(float __a);
__DEVICE__ float __nv_normf(int __a, const float *__b);
__DEVICE__ double __nv_norm(int __a, const double *__b);
__DEVICE__ int __nv_popc(unsigned int __a);
__DEVICE__ int __nv_popcll(unsigned long long __a);
__DEVICE__ double __nv_pow(double __a, double __b);
__DEVICE__ float __nv_powf(float __a, float __b);
__DEVICE__ double __nv_powi(double __a, int __b);
__DEVICE__ float __nv_powif(float __a, int __b);
__DEVICE__ double __nv_rcbrt(double __a);
__DEVICE__ float __nv_rcbrtf(float __a);
__DEVICE__ double __nv_rcp64h(double __a);
__DEVICE__ double __nv_remainder(double __a, double __b);
__DEVICE__ float __nv_remainderf(float __a, float __b);
__DEVICE__ double __nv_remquo(double __a, double __b, int *__c);
__DEVICE__ float __nv_remquof(float __a, float __b, int *__c);
__DEVICE__ int __nv_rhadd(int __a, int __b);
__DEVICE__ double __nv_rhypot(double __a, double __b);
__DEVICE__ float __nv_rhypotf(float __a, float __b);
__DEVICE__ double __nv_rint(double __a);
__DEVICE__ float __nv_rintf(float __a);
__DEVICE__ double __nv_rnorm3d(double __a, double __b, double __c);
__DEVICE__ float __nv_rnorm3df(float __a, float __b, float __c);
__DEVICE__ double __nv_rnorm4d(double __a, double __b, double __c, double __d);
__DEVICE__ float __nv_rnorm4df(float __a, float __b, float __c, float __d);
__DEVICE__ float __nv_rnormf(int __a, const float *__b);
__DEVICE__ double __nv_rnorm(int __a, const double *__b);
__DEVICE__ double __nv_round(double __a);
__DEVICE__ float __nv_roundf(float __a);
__DEVICE__ double __nv_rsqrt(double __a);
__DEVICE__ float __nv_rsqrtf(float __a);
__DEVICE__ int __nv_sad(int __a, int __b, int __c);
__DEVICE__ float __nv_saturatef(float __a);
__DEVICE__ double __nv_scalbn(double __a, int __b);
__DEVICE__ float __nv_scalbnf(float __a, int __b);
__DEVICE__ int __nv_signbitd(double __a);
__DEVICE__ int __nv_signbitf(float __a);
__DEVICE__ void __nv_sincos(double __a, double *__b, double *__c);
__DEVICE__ void __nv_sincosf(float __a, float *__b, float *__c);
__DEVICE__ void __nv_sincospi(double __a, double *__b, double *__c);
__DEVICE__ void __nv_sincospif(float __a, float *__b, float *__c);
__DEVICE__ double __nv_sin(double __a);
__DEVICE__ float __nv_sinf(float __a);
__DEVICE__ double __nv_sinh(double __a);
__DEVICE__ float __nv_sinhf(float __a);
__DEVICE__ double __nv_sinpi(double __a);
__DEVICE__ float __nv_sinpif(float __a);
__DEVICE__ double __nv_sqrt(double __a);
__DEVICE__ float __nv_sqrtf(float __a);
__DEVICE__ double __nv_tan(double __a);
__DEVICE__ float __nv_tanf(float __a);
__DEVICE__ double __nv_tanh(double __a);
__DEVICE__ float __nv_tanhf(float __a);
__DEVICE__ double __nv_tgamma(double __a);
__DEVICE__ float __nv_tgammaf(float __a);
__DEVICE__ double __nv_trunc(double __a);
__DEVICE__ float __nv_truncf(float __a);
__DEVICE__ int __nv_uhadd(unsigned int __a, unsigned int __b);
__DEVICE__ double __nv_uint2double_rn(unsigned int __i);
__DEVICE__ float __nv_uint2float_rd(unsigned int __a);
__DEVICE__ float __nv_uint2float_rn(unsigned int __a);
__DEVICE__ float __nv_uint2float_ru(unsigned int __a);
__DEVICE__ float __nv_uint2float_rz(unsigned int __a);
__DEVICE__ float __nv_uint_as_float(unsigned int __a);
__DEVICE__ double __nv_ull2double_rd(unsigned long long __a);
__DEVICE__ double __nv_ull2double_rn(unsigned long long __a);
__DEVICE__ double __nv_ull2double_ru(unsigned long long __a);
__DEVICE__ double __nv_ull2double_rz(unsigned long long __a);
__DEVICE__ float __nv_ull2float_rd(unsigned long long __a);
__DEVICE__ float __nv_ull2float_rn(unsigned long long __a);
__DEVICE__ float __nv_ull2float_ru(unsigned long long __a);
__DEVICE__ float __nv_ull2float_rz(unsigned long long __a);
__DEVICE__ unsigned long long __nv_ullmax(unsigned long long __a,
                                          unsigned long long __b);
__DEVICE__ unsigned long long __nv_ullmin(unsigned long long __a,
                                          unsigned long long __b);
__DEVICE__ unsigned int __nv_umax(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned int __nv_umin(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned int __nv_umul24(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned long long __nv_umul64hi(unsigned long long __a,
                                            unsigned long long __b);
__DEVICE__ unsigned int __nv_umulhi(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned int __nv_urhadd(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned int __nv_usad(unsigned int __a, unsigned int __b,
                                  unsigned int __c);
#if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020
__DEVICE__ int __nv_vabs2(int __a);
__DEVICE__ int __nv_vabs4(int __a);
__DEVICE__ int __nv_vabsdiffs2(int __a, int __b);
__DEVICE__ int __nv_vabsdiffs4(int __a, int __b);
__DEVICE__ int __nv_vabsdiffu2(int __a, int __b);
__DEVICE__ int __nv_vabsdiffu4(int __a, int __b);
__DEVICE__ int __nv_vabsss2(int __a);
__DEVICE__ int __nv_vabsss4(int __a);
__DEVICE__ int __nv_vadd2(int __a, int __b);
__DEVICE__ int __nv_vadd4(int __a, int __b);
__DEVICE__ int __nv_vaddss2(int __a, int __b);
__DEVICE__ int __nv_vaddss4(int __a, int __b);
__DEVICE__ int __nv_vaddus2(int __a, int __b);
__DEVICE__ int __nv_vaddus4(int __a, int __b);
__DEVICE__ int __nv_vavgs2(int __a, int __b);
__DEVICE__ int __nv_vavgs4(int __a, int __b);
__DEVICE__ int __nv_vavgu2(int __a, int __b);
__DEVICE__ int __nv_vavgu4(int __a, int __b);
__DEVICE__ int __nv_vcmpeq2(int __a, int __b);
__DEVICE__ int __nv_vcmpeq4(int __a, int __b);
__DEVICE__ int __nv_vcmpges2(int __a, int __b);
__DEVICE__ int __nv_vcmpges4(int __a, int __b);
__DEVICE__ int __nv_vcmpgeu2(int __a, int __b);
__DEVICE__ int __nv_vcmpgeu4(int __a, int __b);
__DEVICE__ int __nv_vcmpgts2(int __a, int __b);
__DEVICE__ int __nv_vcmpgts4(int __a, int __b);
__DEVICE__ int __nv_vcmpgtu2(int __a, int __b);
__DEVICE__ int __nv_vcmpgtu4(int __a, int __b);
__DEVICE__ int __nv_vcmples2(int __a, int __b);
__DEVICE__ int __nv_vcmples4(int __a, int __b);
__DEVICE__ int __nv_vcmpleu2(int __a, int __b);
__DEVICE__ int __nv_vcmpleu4(int __a, int __b);
__DEVICE__ int __nv_vcmplts2(int __a, int __b);
__DEVICE__ int __nv_vcmplts4(int __a, int __b);
__DEVICE__ int __nv_vcmpltu2(int __a, int __b);
__DEVICE__ int __nv_vcmpltu4(int __a, int __b);
__DEVICE__ int __nv_vcmpne2(int __a, int __b);
__DEVICE__ int __nv_vcmpne4(int __a, int __b);
__DEVICE__ int __nv_vhaddu2(int __a, int __b);
__DEVICE__ int __nv_vhaddu4(int __a, int __b);
__DEVICE__ int __nv_vmaxs2(int __a, int __b);
__DEVICE__ int __nv_vmaxs4(int __a, int __b);
__DEVICE__ int __nv_vmaxu2(int __a, int __b);
__DEVICE__ int __nv_vmaxu4(int __a, int __b);
__DEVICE__ int __nv_vmins2(int __a, int __b);
__DEVICE__ int __nv_vmins4(int __a, int __b);
__DEVICE__ int __nv_vminu2(int __a, int __b);
__DEVICE__ int __nv_vminu4(int __a, int __b);
__DEVICE__ int __nv_vneg2(int __a);
__DEVICE__ int __nv_vneg4(int __a);
__DEVICE__ int __nv_vnegss2(int __a);
__DEVICE__ int __nv_vnegss4(int __a);
__DEVICE__ int __nv_vsads2(int __a, int __b);
__DEVICE__ int __nv_vsads4(int __a, int __b);
__DEVICE__ int __nv_vsadu2(int __a, int __b);
__DEVICE__ int __nv_vsadu4(int __a, int __b);
__DEVICE__ int __nv_vseteq2(int __a, int __b);
__DEVICE__ int __nv_vseteq4(int __a, int __b);
__DEVICE__ int __nv_vsetges2(int __a, int __b);
__DEVICE__ int __nv_vsetges4(int __a, int __b);
__DEVICE__ int __nv_vsetgeu2(int __a, int __b);
__DEVICE__ int __nv_vsetgeu4(int __a, int __b);
__DEVICE__ int __nv_vsetgts2(int __a, int __b);
__DEVICE__ int __nv_vsetgts4(int __a, int __b);
__DEVICE__ int __nv_vsetgtu2(int __a, int __b);
__DEVICE__ int __nv_vsetgtu4(int __a, int __b);
__DEVICE__ int __nv_vsetles2(int __a, int __b);
__DEVICE__ int __nv_vsetles4(int __a, int __b);
__DEVICE__ int __nv_vsetleu2(int __a, int __b);
__DEVICE__ int __nv_vsetleu4(int __a, int __b);
__DEVICE__ int __nv_vsetlts2(int __a, int __b);
__DEVICE__ int __nv_vsetlts4(int __a, int __b);
__DEVICE__ int __nv_vsetltu2(int __a, int __b);
__DEVICE__ int __nv_vsetltu4(int __a, int __b);
__DEVICE__ int __nv_vsetne2(int __a, int __b);
__DEVICE__ int __nv_vsetne4(int __a, int __b);
__DEVICE__ int __nv_vsub2(int __a, int __b);
__DEVICE__ int __nv_vsub4(int __a, int __b);
__DEVICE__ int __nv_vsubss2(int __a, int __b);
__DEVICE__ int __nv_vsubss4(int __a, int __b);
__DEVICE__ int __nv_vsubus2(int __a, int __b);
__DEVICE__ int __nv_vsubus4(int __a, int __b);
#endif // CUDA_VERSION
__DEVICE__ double __nv_y0(double __a);
__DEVICE__ float __nv_y0f(float __a);
__DEVICE__ double __nv_y1(double __a);
__DEVICE__ float __nv_y1f(float __a);
__DEVICE__ float __nv_ynf(int __a, float __b);
__DEVICE__ double __nv_yn(int __a, double __b);

#if defined(__OPENMP_NVPTX__)
#pragma omp end assumes ext_spmd_amenable no_openmp
#endif

#if defined(__cplusplus)
} // extern "C"
#endif
#endif // __CLANG_CUDA_LIBDEVICE_DECLARES_H__
@@ -1,353 +0,0 @@
/*===---- __clang_cuda_math.h - Device-side CUDA math support --------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __CLANG_CUDA_MATH_H__
#define __CLANG_CUDA_MATH_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

// The __CLANG_GPU_DISABLE_MATH_WRAPPERS macro provides a way to let standard
// libcalls reach the link step instead of being eagerly replaced.
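// For example, a device-side compile with
//   clang++ -x cuda -D__CLANG_GPU_DISABLE_MATH_WRAPPERS ...
// skips everything below, so a call like sin(x) stays an unresolved libcall
// for a later link step instead of being lowered to __nv_sin(x) here.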
#ifndef __CLANG_GPU_DISABLE_MATH_WRAPPERS

#ifndef __OPENMP_NVPTX__
#if CUDA_VERSION < 9000
#error This file is intended to be used with CUDA-9+ only.
#endif
#endif

// __DEVICE__ is a helper macro with a common set of attributes for the
// wrappers we implement in this file. We need static in order to avoid
// emitting unused functions and __forceinline__ helps inlining these
// wrappers at -O1.
#pragma push_macro("__DEVICE__")
#ifdef __OPENMP_NVPTX__
#if defined(__cplusplus)
#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
#else
#define __DEVICE__ static __attribute__((always_inline, nothrow))
#endif
#else
#define __DEVICE__ static __device__ __forceinline__
#endif

// Specialized version of __DEVICE__ for functions with void return type.
// Needed because the OpenMP overlay requires constexpr functions here but
// prior to C++14 void-returning functions could not be constexpr.
#pragma push_macro("__DEVICE_VOID__")
#if defined(__OPENMP_NVPTX__) && defined(__cplusplus) && __cplusplus < 201402L
#define __DEVICE_VOID__ static __attribute__((always_inline, nothrow))
#else
#define __DEVICE_VOID__ __DEVICE__
#endif
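// For instance, under the pre-C++14 OpenMP overlay a 'constexpr void'
// function would be ill-formed, which is why the sincos() family further
// down is declared with __DEVICE_VOID__ rather than __DEVICE__.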

// libdevice provides fast low-precision and slow full-precision
// implementations for some functions. Which one gets selected depends on
// __CLANG_GPU_APPROX_TRANSCENDENTALS__ which gets defined by clang if
// -ffast-math or -fgpu-approx-transcendentals are in effect.
#pragma push_macro("__FAST_OR_SLOW")
#if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__)
#define __FAST_OR_SLOW(fast, slow) fast
#else
#define __FAST_OR_SLOW(fast, slow) slow
#endif
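// For example, with approximate transcendentals enabled, cosf() below
// expands to __nv_fast_cosf(__a); otherwise it expands to __nv_cosf(__a).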

__DEVICE__ int abs(int __a) { return __nv_abs(__a); }
__DEVICE__ double fabs(double __a) { return __nv_fabs(__a); }
__DEVICE__ double acos(double __a) { return __nv_acos(__a); }
__DEVICE__ float acosf(float __a) { return __nv_acosf(__a); }
__DEVICE__ double acosh(double __a) { return __nv_acosh(__a); }
__DEVICE__ float acoshf(float __a) { return __nv_acoshf(__a); }
__DEVICE__ double asin(double __a) { return __nv_asin(__a); }
__DEVICE__ float asinf(float __a) { return __nv_asinf(__a); }
__DEVICE__ double asinh(double __a) { return __nv_asinh(__a); }
__DEVICE__ float asinhf(float __a) { return __nv_asinhf(__a); }
__DEVICE__ double atan(double __a) { return __nv_atan(__a); }
__DEVICE__ double atan2(double __a, double __b) { return __nv_atan2(__a, __b); }
__DEVICE__ float atan2f(float __a, float __b) { return __nv_atan2f(__a, __b); }
__DEVICE__ float atanf(float __a) { return __nv_atanf(__a); }
__DEVICE__ double atanh(double __a) { return __nv_atanh(__a); }
__DEVICE__ float atanhf(float __a) { return __nv_atanhf(__a); }
__DEVICE__ double cbrt(double __a) { return __nv_cbrt(__a); }
__DEVICE__ float cbrtf(float __a) { return __nv_cbrtf(__a); }
__DEVICE__ double ceil(double __a) { return __nv_ceil(__a); }
__DEVICE__ float ceilf(float __a) { return __nv_ceilf(__a); }
__DEVICE__ double copysign(double __a, double __b) {
  return __nv_copysign(__a, __b);
}
__DEVICE__ float copysignf(float __a, float __b) {
  return __nv_copysignf(__a, __b);
}
__DEVICE__ double cos(double __a) { return __nv_cos(__a); }
__DEVICE__ float cosf(float __a) {
  return __FAST_OR_SLOW(__nv_fast_cosf, __nv_cosf)(__a);
}
__DEVICE__ double cosh(double __a) { return __nv_cosh(__a); }
__DEVICE__ float coshf(float __a) { return __nv_coshf(__a); }
__DEVICE__ double cospi(double __a) { return __nv_cospi(__a); }
__DEVICE__ float cospif(float __a) { return __nv_cospif(__a); }
__DEVICE__ double cyl_bessel_i0(double __a) { return __nv_cyl_bessel_i0(__a); }
__DEVICE__ float cyl_bessel_i0f(float __a) { return __nv_cyl_bessel_i0f(__a); }
__DEVICE__ double cyl_bessel_i1(double __a) { return __nv_cyl_bessel_i1(__a); }
__DEVICE__ float cyl_bessel_i1f(float __a) { return __nv_cyl_bessel_i1f(__a); }
__DEVICE__ double erf(double __a) { return __nv_erf(__a); }
__DEVICE__ double erfc(double __a) { return __nv_erfc(__a); }
__DEVICE__ float erfcf(float __a) { return __nv_erfcf(__a); }
__DEVICE__ double erfcinv(double __a) { return __nv_erfcinv(__a); }
__DEVICE__ float erfcinvf(float __a) { return __nv_erfcinvf(__a); }
__DEVICE__ double erfcx(double __a) { return __nv_erfcx(__a); }
__DEVICE__ float erfcxf(float __a) { return __nv_erfcxf(__a); }
__DEVICE__ float erff(float __a) { return __nv_erff(__a); }
__DEVICE__ double erfinv(double __a) { return __nv_erfinv(__a); }
__DEVICE__ float erfinvf(float __a) { return __nv_erfinvf(__a); }
__DEVICE__ double exp(double __a) { return __nv_exp(__a); }
__DEVICE__ double exp10(double __a) { return __nv_exp10(__a); }
__DEVICE__ float exp10f(float __a) { return __nv_exp10f(__a); }
__DEVICE__ double exp2(double __a) { return __nv_exp2(__a); }
__DEVICE__ float exp2f(float __a) { return __nv_exp2f(__a); }
__DEVICE__ float expf(float __a) { return __nv_expf(__a); }
__DEVICE__ double expm1(double __a) { return __nv_expm1(__a); }
__DEVICE__ float expm1f(float __a) { return __nv_expm1f(__a); }
__DEVICE__ float fabsf(float __a) { return __nv_fabsf(__a); }
__DEVICE__ double fdim(double __a, double __b) { return __nv_fdim(__a, __b); }
__DEVICE__ float fdimf(float __a, float __b) { return __nv_fdimf(__a, __b); }
__DEVICE__ double fdivide(double __a, double __b) { return __a / __b; }
__DEVICE__ float fdividef(float __a, float __b) {
#if __FAST_MATH__ && !__CUDA_PREC_DIV
  return __nv_fast_fdividef(__a, __b);
#else
  return __a / __b;
#endif
}
__DEVICE__ double floor(double __f) { return __nv_floor(__f); }
__DEVICE__ float floorf(float __f) { return __nv_floorf(__f); }
__DEVICE__ double fma(double __a, double __b, double __c) {
  return __nv_fma(__a, __b, __c);
}
__DEVICE__ float fmaf(float __a, float __b, float __c) {
  return __nv_fmaf(__a, __b, __c);
}
__DEVICE__ double fmax(double __a, double __b) { return __nv_fmax(__a, __b); }
__DEVICE__ float fmaxf(float __a, float __b) { return __nv_fmaxf(__a, __b); }
__DEVICE__ double fmin(double __a, double __b) { return __nv_fmin(__a, __b); }
__DEVICE__ float fminf(float __a, float __b) { return __nv_fminf(__a, __b); }
__DEVICE__ double fmod(double __a, double __b) { return __nv_fmod(__a, __b); }
__DEVICE__ float fmodf(float __a, float __b) { return __nv_fmodf(__a, __b); }
__DEVICE__ double frexp(double __a, int *__b) { return __nv_frexp(__a, __b); }
__DEVICE__ float frexpf(float __a, int *__b) { return __nv_frexpf(__a, __b); }
__DEVICE__ double hypot(double __a, double __b) { return __nv_hypot(__a, __b); }
__DEVICE__ float hypotf(float __a, float __b) { return __nv_hypotf(__a, __b); }
__DEVICE__ int ilogb(double __a) { return __nv_ilogb(__a); }
__DEVICE__ int ilogbf(float __a) { return __nv_ilogbf(__a); }
__DEVICE__ double j0(double __a) { return __nv_j0(__a); }
__DEVICE__ float j0f(float __a) { return __nv_j0f(__a); }
__DEVICE__ double j1(double __a) { return __nv_j1(__a); }
__DEVICE__ float j1f(float __a) { return __nv_j1f(__a); }
__DEVICE__ double jn(int __n, double __a) { return __nv_jn(__n, __a); }
__DEVICE__ float jnf(int __n, float __a) { return __nv_jnf(__n, __a); }
#if defined(__LP64__) || defined(_WIN64)
__DEVICE__ long labs(long __a) { return __nv_llabs(__a); };
#else
__DEVICE__ long labs(long __a) { return __nv_abs(__a); };
#endif
__DEVICE__ double ldexp(double __a, int __b) { return __nv_ldexp(__a, __b); }
__DEVICE__ float ldexpf(float __a, int __b) { return __nv_ldexpf(__a, __b); }
__DEVICE__ double lgamma(double __a) { return __nv_lgamma(__a); }
__DEVICE__ float lgammaf(float __a) { return __nv_lgammaf(__a); }
__DEVICE__ long long llabs(long long __a) { return __nv_llabs(__a); }
__DEVICE__ long long llmax(long long __a, long long __b) {
  return __nv_llmax(__a, __b);
}
__DEVICE__ long long llmin(long long __a, long long __b) {
  return __nv_llmin(__a, __b);
}
__DEVICE__ long long llrint(double __a) { return __nv_llrint(__a); }
__DEVICE__ long long llrintf(float __a) { return __nv_llrintf(__a); }
__DEVICE__ long long llround(double __a) { return __nv_llround(__a); }
__DEVICE__ long long llroundf(float __a) { return __nv_llroundf(__a); }
__DEVICE__ double round(double __a) { return __nv_round(__a); }
__DEVICE__ float roundf(float __a) { return __nv_roundf(__a); }
__DEVICE__ double log(double __a) { return __nv_log(__a); }
__DEVICE__ double log10(double __a) { return __nv_log10(__a); }
__DEVICE__ float log10f(float __a) { return __nv_log10f(__a); }
__DEVICE__ double log1p(double __a) { return __nv_log1p(__a); }
__DEVICE__ float log1pf(float __a) { return __nv_log1pf(__a); }
__DEVICE__ double log2(double __a) { return __nv_log2(__a); }
__DEVICE__ float log2f(float __a) {
  return __FAST_OR_SLOW(__nv_fast_log2f, __nv_log2f)(__a);
}
__DEVICE__ double logb(double __a) { return __nv_logb(__a); }
__DEVICE__ float logbf(float __a) { return __nv_logbf(__a); }
__DEVICE__ float logf(float __a) {
  return __FAST_OR_SLOW(__nv_fast_logf, __nv_logf)(__a);
}
#if defined(__LP64__) || defined(_WIN64)
__DEVICE__ long lrint(double __a) { return llrint(__a); }
__DEVICE__ long lrintf(float __a) { return __float2ll_rn(__a); }
__DEVICE__ long lround(double __a) { return llround(__a); }
__DEVICE__ long lroundf(float __a) { return llroundf(__a); }
#else
__DEVICE__ long lrint(double __a) { return (long)rint(__a); }
__DEVICE__ long lrintf(float __a) { return __float2int_rn(__a); }
__DEVICE__ long lround(double __a) { return round(__a); }
__DEVICE__ long lroundf(float __a) { return roundf(__a); }
#endif
__DEVICE__ int max(int __a, int __b) { return __nv_max(__a, __b); }
__DEVICE__ int min(int __a, int __b) { return __nv_min(__a, __b); }
__DEVICE__ double modf(double __a, double *__b) { return __nv_modf(__a, __b); }
__DEVICE__ float modff(float __a, float *__b) { return __nv_modff(__a, __b); }
__DEVICE__ double nearbyint(double __a) { return __builtin_nearbyint(__a); }
__DEVICE__ float nearbyintf(float __a) { return __builtin_nearbyintf(__a); }
__DEVICE__ double nextafter(double __a, double __b) {
  return __nv_nextafter(__a, __b);
}
__DEVICE__ float nextafterf(float __a, float __b) {
  return __nv_nextafterf(__a, __b);
}
__DEVICE__ double norm(int __dim, const double *__t) {
  return __nv_norm(__dim, __t);
}
__DEVICE__ double norm3d(double __a, double __b, double __c) {
  return __nv_norm3d(__a, __b, __c);
}
__DEVICE__ float norm3df(float __a, float __b, float __c) {
  return __nv_norm3df(__a, __b, __c);
}
__DEVICE__ double norm4d(double __a, double __b, double __c, double __d) {
  return __nv_norm4d(__a, __b, __c, __d);
}
__DEVICE__ float norm4df(float __a, float __b, float __c, float __d) {
  return __nv_norm4df(__a, __b, __c, __d);
}
__DEVICE__ double normcdf(double __a) { return __nv_normcdf(__a); }
__DEVICE__ float normcdff(float __a) { return __nv_normcdff(__a); }
__DEVICE__ double normcdfinv(double __a) { return __nv_normcdfinv(__a); }
__DEVICE__ float normcdfinvf(float __a) { return __nv_normcdfinvf(__a); }
__DEVICE__ float normf(int __dim, const float *__t) {
  return __nv_normf(__dim, __t);
}
__DEVICE__ double pow(double __a, double __b) { return __nv_pow(__a, __b); }
__DEVICE__ float powf(float __a, float __b) { return __nv_powf(__a, __b); }
__DEVICE__ double powi(double __a, int __b) { return __nv_powi(__a, __b); }
__DEVICE__ float powif(float __a, int __b) { return __nv_powif(__a, __b); }
__DEVICE__ double rcbrt(double __a) { return __nv_rcbrt(__a); }
__DEVICE__ float rcbrtf(float __a) { return __nv_rcbrtf(__a); }
__DEVICE__ double remainder(double __a, double __b) {
  return __nv_remainder(__a, __b);
}
__DEVICE__ float remainderf(float __a, float __b) {
  return __nv_remainderf(__a, __b);
}
__DEVICE__ double remquo(double __a, double __b, int *__c) {
  return __nv_remquo(__a, __b, __c);
}
__DEVICE__ float remquof(float __a, float __b, int *__c) {
  return __nv_remquof(__a, __b, __c);
}
__DEVICE__ double rhypot(double __a, double __b) {
  return __nv_rhypot(__a, __b);
}
__DEVICE__ float rhypotf(float __a, float __b) {
  return __nv_rhypotf(__a, __b);
}
// __nv_rint* in libdevice is buggy and produces incorrect results.
__DEVICE__ double rint(double __a) { return __builtin_rint(__a); }
__DEVICE__ float rintf(float __a) { return __builtin_rintf(__a); }
__DEVICE__ double rnorm(int __a, const double *__b) {
  return __nv_rnorm(__a, __b);
}
__DEVICE__ double rnorm3d(double __a, double __b, double __c) {
  return __nv_rnorm3d(__a, __b, __c);
}
__DEVICE__ float rnorm3df(float __a, float __b, float __c) {
  return __nv_rnorm3df(__a, __b, __c);
}
__DEVICE__ double rnorm4d(double __a, double __b, double __c, double __d) {
  return __nv_rnorm4d(__a, __b, __c, __d);
}
__DEVICE__ float rnorm4df(float __a, float __b, float __c, float __d) {
  return __nv_rnorm4df(__a, __b, __c, __d);
}
__DEVICE__ float rnormf(int __dim, const float *__t) {
  return __nv_rnormf(__dim, __t);
}
__DEVICE__ double rsqrt(double __a) { return __nv_rsqrt(__a); }
__DEVICE__ float rsqrtf(float __a) { return __nv_rsqrtf(__a); }
__DEVICE__ double scalbn(double __a, int __b) { return __nv_scalbn(__a, __b); }
__DEVICE__ float scalbnf(float __a, int __b) { return __nv_scalbnf(__a, __b); }
__DEVICE__ double scalbln(double __a, long __b) {
  if (__b > INT_MAX)
    return __a > 0 ? HUGE_VAL : -HUGE_VAL;
  if (__b < INT_MIN)
    return __a > 0 ? 0.0 : -0.0;
  return scalbn(__a, (int)__b);
}
__DEVICE__ float scalblnf(float __a, long __b) {
  if (__b > INT_MAX)
    return __a > 0 ? HUGE_VALF : -HUGE_VALF;
  if (__b < INT_MIN)
    return __a > 0 ? 0.f : -0.f;
  return scalbnf(__a, (int)__b);
}
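// e.g. on an LP64 target, scalbln(1.0, 5000000000L) saturates to HUGE_VAL
// because the exponent does not fit in scalbn's int argument.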
__DEVICE__ double sin(double __a) { return __nv_sin(__a); }
__DEVICE_VOID__ void sincos(double __a, double *__s, double *__c) {
  return __nv_sincos(__a, __s, __c);
}
__DEVICE_VOID__ void sincosf(float __a, float *__s, float *__c) {
  return __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __s, __c);
}
__DEVICE_VOID__ void sincospi(double __a, double *__s, double *__c) {
  return __nv_sincospi(__a, __s, __c);
}
__DEVICE_VOID__ void sincospif(float __a, float *__s, float *__c) {
  return __nv_sincospif(__a, __s, __c);
}
__DEVICE__ float sinf(float __a) {
  return __FAST_OR_SLOW(__nv_fast_sinf, __nv_sinf)(__a);
}
__DEVICE__ double sinh(double __a) { return __nv_sinh(__a); }
__DEVICE__ float sinhf(float __a) { return __nv_sinhf(__a); }
__DEVICE__ double sinpi(double __a) { return __nv_sinpi(__a); }
__DEVICE__ float sinpif(float __a) { return __nv_sinpif(__a); }
__DEVICE__ double sqrt(double __a) { return __nv_sqrt(__a); }
__DEVICE__ float sqrtf(float __a) { return __nv_sqrtf(__a); }
__DEVICE__ double tan(double __a) { return __nv_tan(__a); }
__DEVICE__ float tanf(float __a) { return __nv_tanf(__a); }
__DEVICE__ double tanh(double __a) { return __nv_tanh(__a); }
__DEVICE__ float tanhf(float __a) { return __nv_tanhf(__a); }
__DEVICE__ double tgamma(double __a) { return __nv_tgamma(__a); }
__DEVICE__ float tgammaf(float __a) { return __nv_tgammaf(__a); }
__DEVICE__ double trunc(double __a) { return __nv_trunc(__a); }
__DEVICE__ float truncf(float __a) { return __nv_truncf(__a); }
__DEVICE__ unsigned long long ullmax(unsigned long long __a,
                                     unsigned long long __b) {
  return __nv_ullmax(__a, __b);
}
__DEVICE__ unsigned long long ullmin(unsigned long long __a,
                                     unsigned long long __b) {
  return __nv_ullmin(__a, __b);
}
__DEVICE__ unsigned int umax(unsigned int __a, unsigned int __b) {
  return __nv_umax(__a, __b);
}
__DEVICE__ unsigned int umin(unsigned int __a, unsigned int __b) {
  return __nv_umin(__a, __b);
}
__DEVICE__ double y0(double __a) { return __nv_y0(__a); }
__DEVICE__ float y0f(float __a) { return __nv_y0f(__a); }
__DEVICE__ double y1(double __a) { return __nv_y1(__a); }
__DEVICE__ float y1f(float __a) { return __nv_y1f(__a); }
__DEVICE__ double yn(int __a, double __b) { return __nv_yn(__a, __b); }
__DEVICE__ float ynf(int __a, float __b) { return __nv_ynf(__a, __b); }

#pragma pop_macro("__DEVICE__")
#pragma pop_macro("__DEVICE_VOID__")
#pragma pop_macro("__FAST_OR_SLOW")

#endif // __CLANG_GPU_DISABLE_MATH_WRAPPERS
#endif // __CLANG_CUDA_MATH_H__
@@ -1,284 +0,0 @@
/*===- __clang_math_forward_declares.h - Prototypes of __device__ math fns --===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
#define __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
#if !defined(__CUDA__) && !__HIP__
#error "This file is for CUDA/HIP compilation only."
#endif

// This file forward-declares some math functions we (or the CUDA headers)
// will define later. We need to do this, and do it before cmath is included,
// because the standard library may have constexpr math functions. In the
// absence of a prior __device__ decl, those constexpr functions may become
// implicitly host+device. host+device functions can't be overloaded, so that
// would preclude the use of our own __device__ overloads for these functions.
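// An illustrative failure this avoids: if the standard library declares
//   constexpr float sqrt(float);
// before any __device__ sqrt(float) is seen, clang may treat that constexpr
// function as implicitly host+device, and the __device__ overload defined
// later by the CUDA headers would then conflict with it.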
|
|
||||||
|
|
||||||
#pragma push_macro("__DEVICE__")
|
|
||||||
#define __DEVICE__ \
|
|
||||||
static __inline__ __attribute__((always_inline)) __attribute__((device))
|
|
||||||
|
|
||||||
__DEVICE__ long abs(long);
|
|
||||||
__DEVICE__ long long abs(long long);
|
|
||||||
__DEVICE__ double abs(double);
|
|
||||||
__DEVICE__ float abs(float);
|
|
||||||
__DEVICE__ int abs(int);
|
|
||||||
__DEVICE__ double acos(double);
|
|
||||||
__DEVICE__ float acos(float);
|
|
||||||
__DEVICE__ double acosh(double);
|
|
||||||
__DEVICE__ float acosh(float);
|
|
||||||
__DEVICE__ double asin(double);
|
|
||||||
__DEVICE__ float asin(float);
|
|
||||||
__DEVICE__ double asinh(double);
|
|
||||||
__DEVICE__ float asinh(float);
|
|
||||||
__DEVICE__ double atan2(double, double);
|
|
||||||
__DEVICE__ float atan2(float, float);
|
|
||||||
__DEVICE__ double atan(double);
|
|
||||||
__DEVICE__ float atan(float);
|
|
||||||
__DEVICE__ double atanh(double);
|
|
||||||
__DEVICE__ float atanh(float);
|
|
||||||
__DEVICE__ double cbrt(double);
|
|
||||||
__DEVICE__ float cbrt(float);
|
|
||||||
__DEVICE__ double ceil(double);
|
|
||||||
__DEVICE__ float ceil(float);
|
|
||||||
__DEVICE__ double copysign(double, double);
|
|
||||||
__DEVICE__ float copysign(float, float);
|
|
||||||
__DEVICE__ double cos(double);
|
|
||||||
__DEVICE__ float cos(float);
|
|
||||||
__DEVICE__ double cosh(double);
|
|
||||||
__DEVICE__ float cosh(float);
|
|
||||||
__DEVICE__ double erfc(double);
|
|
||||||
__DEVICE__ float erfc(float);
|
|
||||||
__DEVICE__ double erf(double);
|
|
||||||
__DEVICE__ float erf(float);
|
|
||||||
__DEVICE__ double exp2(double);
|
|
||||||
__DEVICE__ float exp2(float);
|
|
||||||
__DEVICE__ double exp(double);
|
|
||||||
__DEVICE__ float exp(float);
|
|
||||||
__DEVICE__ double expm1(double);
|
|
||||||
__DEVICE__ float expm1(float);
|
|
||||||
__DEVICE__ double fabs(double);
|
|
||||||
__DEVICE__ float fabs(float);
|
|
||||||
__DEVICE__ double fdim(double, double);
|
|
||||||
__DEVICE__ float fdim(float, float);
|
|
||||||
__DEVICE__ double floor(double);
|
|
||||||
__DEVICE__ float floor(float);
|
|
||||||
__DEVICE__ double fma(double, double, double);
|
|
||||||
__DEVICE__ float fma(float, float, float);
|
|
||||||
__DEVICE__ double fmax(double, double);
|
|
||||||
__DEVICE__ float fmax(float, float);
|
|
||||||
__DEVICE__ double fmin(double, double);
|
|
||||||
__DEVICE__ float fmin(float, float);
|
|
||||||
__DEVICE__ double fmod(double, double);
|
|
||||||
__DEVICE__ float fmod(float, float);
|
|
||||||
__DEVICE__ int fpclassify(double);
|
|
||||||
__DEVICE__ int fpclassify(float);
|
|
||||||
__DEVICE__ double frexp(double, int *);
|
|
||||||
__DEVICE__ float frexp(float, int *);
|
|
||||||
__DEVICE__ double hypot(double, double);
|
|
||||||
__DEVICE__ float hypot(float, float);
|
|
||||||
__DEVICE__ int ilogb(double);
|
|
||||||
__DEVICE__ int ilogb(float);
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
__DEVICE__ bool isfinite(long double);
|
|
||||||
#endif
|
|
||||||
__DEVICE__ bool isfinite(double);
|
|
||||||
__DEVICE__ bool isfinite(float);
|
|
||||||
__DEVICE__ bool isgreater(double, double);
|
|
||||||
__DEVICE__ bool isgreaterequal(double, double);
|
|
||||||
__DEVICE__ bool isgreaterequal(float, float);
|
|
||||||
__DEVICE__ bool isgreater(float, float);
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
__DEVICE__ bool isinf(long double);
|
|
||||||
#endif
|
|
__DEVICE__ bool isinf(double);
__DEVICE__ bool isinf(float);
__DEVICE__ bool isless(double, double);
__DEVICE__ bool islessequal(double, double);
__DEVICE__ bool islessequal(float, float);
__DEVICE__ bool isless(float, float);
__DEVICE__ bool islessgreater(double, double);
__DEVICE__ bool islessgreater(float, float);
#ifdef _MSC_VER
__DEVICE__ bool isnan(long double);
#endif
__DEVICE__ bool isnan(double);
__DEVICE__ bool isnan(float);
__DEVICE__ bool isnormal(double);
__DEVICE__ bool isnormal(float);
__DEVICE__ bool isunordered(double, double);
__DEVICE__ bool isunordered(float, float);
__DEVICE__ long labs(long);
__DEVICE__ double ldexp(double, int);
__DEVICE__ float ldexp(float, int);
__DEVICE__ double lgamma(double);
__DEVICE__ float lgamma(float);
__DEVICE__ long long llabs(long long);
__DEVICE__ long long llrint(double);
__DEVICE__ long long llrint(float);
__DEVICE__ double log10(double);
__DEVICE__ float log10(float);
__DEVICE__ double log1p(double);
__DEVICE__ float log1p(float);
__DEVICE__ double log2(double);
__DEVICE__ float log2(float);
__DEVICE__ double logb(double);
__DEVICE__ float logb(float);
__DEVICE__ double log(double);
__DEVICE__ float log(float);
__DEVICE__ long lrint(double);
__DEVICE__ long lrint(float);
__DEVICE__ long lround(double);
__DEVICE__ long lround(float);
__DEVICE__ long long llround(float); // No llround(double).
__DEVICE__ double modf(double, double *);
__DEVICE__ float modf(float, float *);
__DEVICE__ double nan(const char *);
__DEVICE__ float nanf(const char *);
__DEVICE__ double nearbyint(double);
__DEVICE__ float nearbyint(float);
__DEVICE__ double nextafter(double, double);
__DEVICE__ float nextafter(float, float);
__DEVICE__ double pow(double, double);
__DEVICE__ double pow(double, int);
__DEVICE__ float pow(float, float);
__DEVICE__ float pow(float, int);
__DEVICE__ double remainder(double, double);
__DEVICE__ float remainder(float, float);
__DEVICE__ double remquo(double, double, int *);
__DEVICE__ float remquo(float, float, int *);
__DEVICE__ double rint(double);
__DEVICE__ float rint(float);
__DEVICE__ double round(double);
__DEVICE__ float round(float);
__DEVICE__ double scalbln(double, long);
__DEVICE__ float scalbln(float, long);
__DEVICE__ double scalbn(double, int);
__DEVICE__ float scalbn(float, int);
#ifdef _MSC_VER
__DEVICE__ bool signbit(long double);
#endif
__DEVICE__ bool signbit(double);
__DEVICE__ bool signbit(float);
__DEVICE__ double sin(double);
__DEVICE__ float sin(float);
__DEVICE__ double sinh(double);
__DEVICE__ float sinh(float);
__DEVICE__ double sqrt(double);
__DEVICE__ float sqrt(float);
__DEVICE__ double tan(double);
__DEVICE__ float tan(float);
__DEVICE__ double tanh(double);
__DEVICE__ float tanh(float);
__DEVICE__ double tgamma(double);
__DEVICE__ float tgamma(float);
__DEVICE__ double trunc(double);
__DEVICE__ float trunc(float);

// Notably missing above is nexttoward, which we don't define on
// the device side because libdevice doesn't give us an implementation, and we
// don't want to be in the business of writing one ourselves.

// We need to define these overloads in exactly the namespace our standard
// library uses (including the right inline namespace), otherwise they won't be
// picked up by other functions in the standard library (e.g. functions in
// <complex>). Thus the ugliness below.
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_BEGIN_NAMESPACE_STD
#else
namespace std {
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif
#endif

using ::abs;
using ::acos;
using ::acosh;
using ::asin;
using ::asinh;
using ::atan;
using ::atan2;
using ::atanh;
using ::cbrt;
using ::ceil;
using ::copysign;
using ::cos;
using ::cosh;
using ::erf;
using ::erfc;
using ::exp;
using ::exp2;
using ::expm1;
using ::fabs;
using ::fdim;
using ::floor;
using ::fma;
using ::fmax;
using ::fmin;
using ::fmod;
using ::fpclassify;
using ::frexp;
using ::hypot;
using ::ilogb;
using ::isfinite;
using ::isgreater;
using ::isgreaterequal;
using ::isinf;
using ::isless;
using ::islessequal;
using ::islessgreater;
using ::isnan;
using ::isnormal;
using ::isunordered;
using ::labs;
using ::ldexp;
using ::lgamma;
using ::llabs;
using ::llrint;
using ::log;
using ::log10;
using ::log1p;
using ::log2;
using ::logb;
using ::lrint;
using ::lround;
using ::llround;
using ::modf;
using ::nan;
using ::nanf;
using ::nearbyint;
using ::nextafter;
using ::pow;
using ::remainder;
using ::remquo;
using ::rint;
using ::round;
using ::scalbln;
using ::scalbn;
using ::signbit;
using ::sin;
using ::sinh;
using ::sqrt;
using ::tan;
using ::tanh;
using ::tgamma;
using ::trunc;

#ifdef _LIBCPP_END_NAMESPACE_STD
_LIBCPP_END_NAMESPACE_STD
#else
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_END_NAMESPACE_VERSION
#endif
} // namespace std
#endif
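// [Editor's sketch -- not part of the original header; names hypothetical.]
// Why the inline namespace above matters: unqualified lookup inside an
// inline namespace (libc++'s std::__1) stops at the first scope that
// declares the name, so an overload added to plain `namespace std` is
// invisible to code such as <complex> that lives in std::__1.
namespace __demo { inline namespace __v1 {
inline int __classify(double) { return 0; }
inline int __caller(float __x) { return __classify(__x); } // sees only __v1's
}} // namespace __demo::__v1
namespace __demo { inline int __classify(float) { return 1; } } // too shallow
// __demo::__v1::__caller(1.0f) returns 0: the float overload is never found.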

#pragma pop_macro("__DEVICE__")

#endif
@@ -1,504 +0,0 @@
/*===---- __clang_cuda_runtime_wrapper.h - CUDA runtime support -------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/*
 * WARNING: This header is intended to be directly -include'd by
 * the compiler and is not supposed to be included by users.
 *
 * CUDA headers are implemented in a way that currently makes it
 * impossible for user code to #include directly when compiling with
 * Clang. They present a different view of CUDA-supplied functions
 * depending on where in NVCC's compilation pipeline the headers are
 * included. Neither of these modes provides function definitions with
 * correct attributes, so we use the preprocessor to force the headers
 * into a form that Clang can use.
 *
 * Similarly to NVCC, which -include's cuda_runtime.h, Clang -include's
 * this file during every CUDA compilation.
 */

#ifndef __CLANG_CUDA_RUNTIME_WRAPPER_H__
#define __CLANG_CUDA_RUNTIME_WRAPPER_H__

#if defined(__CUDA__) && defined(__clang__)

// Include some forward declares that must come before cmath.
#include <__clang_cuda_math_forward_declares.h>

// Define __CUDACC__ early as libstdc++ standard headers with GNU extensions
// enabled depend on it to avoid using __float128, which is unsupported in
// CUDA.
#define __CUDACC__

// Include some standard headers to avoid CUDA headers including them
// while some required macros (like __THROW) are in a weird state.
#include <cmath>
#include <cstdlib>
#include <stdlib.h>
#include <string.h>
#undef __CUDACC__

// Preserve common macros that will be changed below by us or by CUDA
// headers.
#pragma push_macro("__THROW")
#pragma push_macro("__CUDA_ARCH__")

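// [Editor's sketch -- illustrative only, not part of the original header.]
// The push_macro/pop_macro pattern used throughout this file: save a macro's
// current definition, redefine it for a region of includes, then restore it.
#pragma push_macro("__EDITOR_DEMO_MODE")
#define __EDITOR_DEMO_MODE 2 // headers needing the temporary value go here
#pragma pop_macro("__EDITOR_DEMO_MODE")
// After pop_macro, __EDITOR_DEMO_MODE has whatever definition (or absence of
// one) it had at the matching push_macro.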
// WARNING: Preprocessor hacks below are based on specific details of
// CUDA-7.x headers and are not expected to work with any other
// version of CUDA headers.
#include "cuda.h"
#if !defined(CUDA_VERSION)
#error "cuda.h did not define CUDA_VERSION"
#elif CUDA_VERSION < 7000
#error "Unsupported CUDA version!"
#endif

#pragma push_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__")
#if CUDA_VERSION >= 10000
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#endif

// Make the largest subset of device functions available during host
// compilation.
#ifndef __CUDA_ARCH__
#define __CUDA_ARCH__ 9999
#endif

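// [Editor's note] Why 9999 works: the CUDA headers gate device definitions
// on the architecture version, e.g. `#if __CUDA_ARCH__ >= 350`. Defining
// __CUDA_ARCH__ to an impossibly high value during host compilation
// satisfies every such guard, exposing the largest subset of declarations.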
#include "__clang_cuda_builtin_vars.h"

// No need for device_launch_parameters.h as __clang_cuda_builtin_vars.h above
// has taken care of builtin variables declared in the file.
#define __DEVICE_LAUNCH_PARAMETERS_H__

// {math,device}_functions.h only have declarations of the
// functions. We don't need them as we're going to pull in their
// definitions from .hpp files.
#define __DEVICE_FUNCTIONS_H__
#define __MATH_FUNCTIONS_H__
#define __COMMON_FUNCTIONS_H__
// device_functions_decls is replaced by __clang_cuda_device_functions.h
// included below.
#define __DEVICE_FUNCTIONS_DECLS_H__

#undef __CUDACC__
#if CUDA_VERSION < 9000
#define __CUDABE__
#else
#define __CUDACC__
#define __CUDA_LIBDEVICE__
#endif
// Disables definitions of device-side runtime support stubs in
// cuda_device_runtime_api.h
#include "host_defines.h"
#undef __CUDACC__
#include "driver_types.h"
#include "host_config.h"

// Temporarily replace "nv_weak" with weak, so __attribute__((nv_weak)) in
// cuda_device_runtime_api.h ends up being __attribute__((weak)), which is the
// functional equivalent of what we need.
#pragma push_macro("nv_weak")
#define nv_weak weak
#undef __CUDABE__
#undef __CUDA_LIBDEVICE__
#define __CUDACC__
#include "cuda_runtime.h"

#pragma pop_macro("nv_weak")
#undef __CUDACC__
#define __CUDABE__

// CUDA headers use __nvvm_memcpy and __nvvm_memset which Clang does
// not have at the moment. Emulate them with a builtin memcpy/memset.
#define __nvvm_memcpy(s, d, n, a) __builtin_memcpy(s, d, n)
#define __nvvm_memset(d, c, n, a) __builtin_memset(d, c, n)

#if CUDA_VERSION < 9000
#include "crt/device_runtime.h"
#endif
#include "crt/host_runtime.h"
// device_runtime.h defines __cxa_* macros that will conflict with
// cxxabi.h.
// FIXME: redefine these as __device__ functions.
#undef __cxa_vec_ctor
#undef __cxa_vec_cctor
#undef __cxa_vec_dtor
#undef __cxa_vec_new
#undef __cxa_vec_new2
#undef __cxa_vec_new3
#undef __cxa_vec_delete2
#undef __cxa_vec_delete
#undef __cxa_vec_delete3
#undef __cxa_pure_virtual

// math_functions.hpp expects this host function to be defined on macOS, but it
// ends up not being there because of the games we play here. Just define it
// ourselves; it's simple enough.
#ifdef __APPLE__
inline __host__ double __signbitd(double x) {
  return std::signbit(x);
}
#endif

// CUDA 9.1 no longer provides declarations for libdevice functions, so we need
// to provide our own.
#include <__clang_cuda_libdevice_declares.h>

// Wrappers for many device-side standard library functions, incl. math
// functions, became compiler builtins in CUDA-9 and have been removed from the
// CUDA headers. Clang now provides its own implementation of the wrappers.
#if CUDA_VERSION >= 9000
#include <__clang_cuda_device_functions.h>
#include <__clang_cuda_math.h>
#endif

// __THROW is redefined to be empty by device_functions_decls.h in CUDA. Clang's
// counterpart does not do it, so we need to make it empty here to keep the
// following CUDA includes happy.
#undef __THROW
#define __THROW

// CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values.
// Previous versions used to check whether they are defined or not.
// The CU_DEVICE_INVALID macro is only defined in 8.0.41, so we use it
// here to detect the switch.

#if defined(CU_DEVICE_INVALID)
#if !defined(__USE_FAST_MATH__)
#define __USE_FAST_MATH__ 0
#endif

#if !defined(__CUDA_PREC_DIV)
#define __CUDA_PREC_DIV 0
#endif
#endif

// Temporarily poison the __host__ macro to ensure it's not used by any of
// the headers we're about to include.
#pragma push_macro("__host__")
#define __host__ UNEXPECTED_HOST_ATTRIBUTE

// device_functions.hpp and math_functions*.hpp use 'static
// __forceinline__' (with no __device__) for definitions of device
// functions. Temporarily redefine __forceinline__ to include
// __device__.
#pragma push_macro("__forceinline__")
#define __forceinline__ __device__ __inline__ __attribute__((always_inline))
#if CUDA_VERSION < 9000
#include "device_functions.hpp"
#endif

// math_functions.hpp uses the __USE_FAST_MATH__ macro to determine whether we
// get the slow-but-accurate or fast-but-inaccurate versions of functions like
// sin and exp. This is controlled in clang by -fgpu-approx-transcendentals.
//
// device_functions.hpp uses __USE_FAST_MATH__ for a different purpose (fast vs.
// slow divides), so we need to scope our define carefully here.
#pragma push_macro("__USE_FAST_MATH__")
#if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__)
#define __USE_FAST_MATH__ 1
#endif

#if CUDA_VERSION >= 9000
#include "crt/math_functions.hpp"
#else
#include "math_functions.hpp"
#endif

#pragma pop_macro("__USE_FAST_MATH__")

#if CUDA_VERSION < 9000
#include "math_functions_dbl_ptx3.hpp"
#endif
#pragma pop_macro("__forceinline__")

// Pull in host-only functions that are only available when neither
// __CUDACC__ nor __CUDABE__ are defined.
#undef __MATH_FUNCTIONS_HPP__
#undef __CUDABE__
#if CUDA_VERSION < 9000
#include "math_functions.hpp"
#endif
// Alas, additional overloads for these functions are hard to get to.
// Considering that we only need these overloads for a few functions,
// we can provide them here.
static inline float rsqrt(float __a) { return rsqrtf(__a); }
static inline float rcbrt(float __a) { return rcbrtf(__a); }
static inline float sinpi(float __a) { return sinpif(__a); }
static inline float cospi(float __a) { return cospif(__a); }
static inline void sincospi(float __a, float *__b, float *__c) {
  return sincospif(__a, __b, __c);
}
static inline float erfcinv(float __a) { return erfcinvf(__a); }
static inline float normcdfinv(float __a) { return normcdfinvf(__a); }
static inline float normcdf(float __a) { return normcdff(__a); }
static inline float erfcx(float __a) { return erfcxf(__a); }

#if CUDA_VERSION < 9000
// For some reason the single-argument variant is not always declared by the
// CUDA headers. Alas, device_functions.hpp included below needs it.
static inline __device__ void __brkpt(int __c) { __brkpt(); }
#endif

// Now include *.hpp with definitions of various GPU functions. Alas,
// a lot of things get declared/defined with the __host__ attribute, which
// we don't want, so we have to define it out. We also have to include
// {device,math}_functions.hpp again in order to extract the other
// branch of #if/else inside.
#define __host__
#undef __CUDABE__
#define __CUDACC__
#if CUDA_VERSION >= 9000
// Some atomic functions became compiler builtins in CUDA-9, so we need their
// declarations.
#include "device_atomic_functions.h"
#endif
#undef __DEVICE_FUNCTIONS_HPP__
#include "device_atomic_functions.hpp"
#if CUDA_VERSION >= 9000
#include "crt/device_functions.hpp"
#include "crt/device_double_functions.hpp"
#else
#include "device_functions.hpp"
#define __CUDABE__
#include "device_double_functions.h"
#undef __CUDABE__
#endif
#include "sm_20_atomic_functions.hpp"
// Predicate functions used in `__builtin_assume` need to have no side
// effects. However, sm_20_intrinsics.hpp doesn't define them with either the
// pure or the const attribute. Rename definitions from sm_20_intrinsics.hpp
// and re-define them as pure ones.
#pragma push_macro("__isGlobal")
#pragma push_macro("__isShared")
#pragma push_macro("__isConstant")
#pragma push_macro("__isLocal")
#define __isGlobal __ignored_cuda___isGlobal
#define __isShared __ignored_cuda___isShared
#define __isConstant __ignored_cuda___isConstant
#define __isLocal __ignored_cuda___isLocal
#include "sm_20_intrinsics.hpp"
#pragma pop_macro("__isGlobal")
#pragma pop_macro("__isShared")
#pragma pop_macro("__isConstant")
#pragma pop_macro("__isLocal")
#pragma push_macro("__DEVICE__")
#define __DEVICE__ static __device__ __forceinline__ __attribute__((const))
__DEVICE__ unsigned int __isGlobal(const void *p) {
  return __nvvm_isspacep_global(p);
}
__DEVICE__ unsigned int __isShared(const void *p) {
  return __nvvm_isspacep_shared(p);
}
__DEVICE__ unsigned int __isConstant(const void *p) {
  return __nvvm_isspacep_const(p);
}
__DEVICE__ unsigned int __isLocal(const void *p) {
  return __nvvm_isspacep_local(p);
}
#pragma pop_macro("__DEVICE__")
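// [Editor's sketch -- illustrative only, not part of the original header.]
// Why the `const` attribute matters above: clang discards a __builtin_assume
// whose argument may have side effects, so a predicate passed to it must be
// provably pure/const.
__attribute__((const)) static inline int
__editor_demo_is_aligned16(const void *__p) {
  return ((unsigned long)__p & 15ul) == 0;
}
static inline void __editor_demo_scale(float *__p, int __n) {
  __builtin_assume(__editor_demo_is_aligned16(__p)); // kept: call is const
  for (int __i = 0; __i < __n; ++__i)
    __p[__i] *= 2.0f;
}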
#include "sm_32_atomic_functions.hpp"

// Don't include sm_30_intrinsics.h and sm_32_intrinsics.h. These define the
// __shfl and __ldg intrinsics using inline (volatile) asm, but we want to
// define them using builtins so that the optimizer can reason about and across
// these instructions. In particular, using intrinsics for ldg gets us the
// [addr+imm] addressing mode, which, although it doesn't actually exist in the
// hardware, seems to generate faster machine code because ptxas can more easily
// reason about our code.

#if CUDA_VERSION >= 8000
#pragma push_macro("__CUDA_ARCH__")
#undef __CUDA_ARCH__
#include "sm_60_atomic_functions.hpp"
#include "sm_61_intrinsics.hpp"
#pragma pop_macro("__CUDA_ARCH__")
#endif

#undef __MATH_FUNCTIONS_HPP__

// math_functions.hpp defines ::signbit as a __host__ __device__ function. This
// conflicts with libstdc++'s constexpr ::signbit, so we have to rename
// math_functions.hpp's ::signbit. It's guarded by #undef signbit, but that's
// conditional on __GNUC__. :)
#pragma push_macro("signbit")
#pragma push_macro("__GNUC__")
#undef __GNUC__
#define signbit __ignored_cuda_signbit

// CUDA-9 omits device-side definitions of some math functions if it sees the
// include guard from libstdc++'s math.h wrapper. We have to undo the header
// guard temporarily to get the definitions we need.
#pragma push_macro("_GLIBCXX_MATH_H")
#pragma push_macro("_LIBCPP_VERSION")
#if CUDA_VERSION >= 9000
#undef _GLIBCXX_MATH_H
// We also need to undo another guard that checks for libc++ 3.8+
#ifdef _LIBCPP_VERSION
#define _LIBCPP_VERSION 3700
#endif
#endif

#if CUDA_VERSION >= 9000
#include "crt/math_functions.hpp"
#else
#include "math_functions.hpp"
#endif
#pragma pop_macro("_GLIBCXX_MATH_H")
#pragma pop_macro("_LIBCPP_VERSION")
#pragma pop_macro("__GNUC__")
#pragma pop_macro("signbit")

#pragma pop_macro("__host__")

// __clang_cuda_texture_intrinsics.h must be included first in order to provide
// the implementation for __nv_tex_surf_handler that CUDA's headers depend on.
// The implementation requires C++11 and only works with CUDA-9 or newer.
#if __cplusplus >= 201103L && CUDA_VERSION >= 9000
// clang-format off
#include <__clang_cuda_texture_intrinsics.h>
// clang-format on
#else
#if CUDA_VERSION >= 9000
// Provide a hint that texture support needs C++11.
template <typename T> struct __nv_tex_needs_cxx11 {
  const static bool value = false;
};
template <class T>
__host__ __device__ void __nv_tex_surf_handler(const char *name, T *ptr,
                                               cudaTextureObject_t obj,
                                               float x) {
  _Static_assert(__nv_tex_needs_cxx11<T>::value,
                 "Texture support requires C++11");
}
#else
// Textures in CUDA-8 and older are not supported by clang. There's no
// convenient way to intercept texture use in these versions, so we can't
// produce a meaningful error. The source code that attempts to use textures
// will continue to fail as it does now.
#endif // CUDA_VERSION
#endif // __cplusplus >= 201103L && CUDA_VERSION >= 9000
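// [Editor's sketch -- not part of the original header; names hypothetical.]
// The trick above is the classic dependent-false static_assert: because
// `value` depends on T, the assertion only fires when the template is
// actually instantiated, turning "unsupported feature used" into a clean
// diagnostic at the call site rather than an error on every compile.
template <typename __T> struct __editor_demo_always_false {
  static const bool value = false;
};
template <typename __T> void __editor_demo_use_textures(__T) {
  static_assert(__editor_demo_always_false<__T>::value,
                "Texture support requires C++11");
}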
#include "surface_indirect_functions.h"
#include "texture_fetch_functions.h"
#include "texture_indirect_functions.h"

// Restore the state of __CUDA_ARCH__ and __THROW we had on entry.
#pragma pop_macro("__CUDA_ARCH__")
#pragma pop_macro("__THROW")

// Set up compiler macros expected to be seen during compilation.
#undef __CUDABE__
#define __CUDACC__

extern "C" {
// Device-side CUDA system calls.
// http://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls
// We need these declarations and wrappers for device-side
// malloc/free/printf calls to work without relying on the
// -fcuda-disable-target-call-checks option.
__device__ int vprintf(const char *, const char *);
__device__ void free(void *) __attribute((nothrow));
__device__ void *malloc(size_t) __attribute((nothrow)) __attribute__((malloc));

// __assertfail() used to have a `noreturn` attribute. Unfortunately that
// contributed to triggering the longstanding bug in ptxas when assert was used
// in sufficiently convoluted code. See
// https://bugs.llvm.org/show_bug.cgi?id=27738 for the details.
__device__ void __assertfail(const char *__message, const char *__file,
                             unsigned __line, const char *__function,
                             size_t __charSize);

// In order for the standard assert() macro on Linux to work, we need to
// provide a device-side __assert_fail().
__device__ static inline void __assert_fail(const char *__message,
                                            const char *__file, unsigned __line,
                                            const char *__function) {
  __assertfail(__message, __file, __line, __function, sizeof(char));
}

// Clang will convert printf into vprintf, but we still need a
// device-side declaration for it.
__device__ int printf(const char *, ...);
} // extern "C"

// We also need device-side std::malloc and std::free.
namespace std {
__device__ static inline void free(void *__ptr) { ::free(__ptr); }
__device__ static inline void *malloc(size_t __size) {
  return ::malloc(__size);
}
} // namespace std

// Out-of-line implementations from __clang_cuda_builtin_vars.h. These need to
// come after we've pulled in the definition of uint3 and dim3.

__device__ inline __cuda_builtin_threadIdx_t::operator dim3() const {
  return dim3(x, y, z);
}

__device__ inline __cuda_builtin_threadIdx_t::operator uint3() const {
  return {x, y, z};
}

__device__ inline __cuda_builtin_blockIdx_t::operator dim3() const {
  return dim3(x, y, z);
}

__device__ inline __cuda_builtin_blockIdx_t::operator uint3() const {
  return {x, y, z};
}

__device__ inline __cuda_builtin_blockDim_t::operator dim3() const {
  return dim3(x, y, z);
}

__device__ inline __cuda_builtin_blockDim_t::operator uint3() const {
  return {x, y, z};
}

__device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
  return dim3(x, y, z);
}

__device__ inline __cuda_builtin_gridDim_t::operator uint3() const {
  return {x, y, z};
}

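// [Editor's note] These conversion operators are what let device code write
//   dim3 __block = blockDim;
//   uint3 __tid = threadIdx;
// even though the builtin variables are special placeholder types rather
// than real dim3/uint3 objects.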
#include <__clang_cuda_cmath.h>
#include <__clang_cuda_intrinsics.h>
#include <__clang_cuda_complex_builtins.h>

// curand_mtgp32_kernel helpfully redeclares blockDim and threadIdx in host
// mode, giving them their "proper" types of dim3 and uint3. This is
// incompatible with the types we give in __clang_cuda_builtin_vars.h. As a
// hack, force-include the header (nvcc doesn't include it by default) but
// redefine dim3 and uint3 to our builtin types. (Thankfully dim3 and uint3 are
// only used here for the redeclarations of blockDim and threadIdx.)
#pragma push_macro("dim3")
#pragma push_macro("uint3")
#define dim3 __cuda_builtin_blockDim_t
#define uint3 __cuda_builtin_threadIdx_t
#include "curand_mtgp32_kernel.h"
#pragma pop_macro("dim3")
#pragma pop_macro("uint3")
#pragma pop_macro("__USE_FAST_MATH__")
#pragma pop_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__")

// The CUDA runtime uses this undocumented function to access kernel launch
// configuration. The declaration is in crt/device_functions.h but that file
// includes a lot of other stuff we don't want. Instead, we'll provide our own
// declaration for it here.
#if CUDA_VERSION >= 9020
extern "C" unsigned __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim,
                                                size_t sharedMem = 0,
                                                void *stream = 0);
#endif

#endif // __CUDA__
#endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__
File diff suppressed because it is too large
@@ -1,848 +0,0 @@
/*===---- __clang_hip_cmath.h - HIP cmath decls -----------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __CLANG_HIP_CMATH_H__
#define __CLANG_HIP_CMATH_H__

#if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__)
#error "This file is for HIP and OpenMP AMDGCN device compilation only."
#endif

#if !defined(__HIPCC_RTC__)
#if defined(__cplusplus)
#include <limits>
#include <type_traits>
#include <utility>
#endif
#include <limits.h>
#include <stdint.h>
#endif // !defined(__HIPCC_RTC__)

#pragma push_macro("__DEVICE__")
#pragma push_macro("__CONSTEXPR__")
#ifdef __OPENMP_AMDGCN__
#define __DEVICE__ static __attribute__((always_inline, nothrow))
#define __CONSTEXPR__ constexpr
#else
#define __DEVICE__ static __device__ inline __attribute__((always_inline))
#define __CONSTEXPR__
#endif // __OPENMP_AMDGCN__

// Start with functions that cannot be defined by the DEF macros below.
#if defined(__cplusplus)
#if defined __OPENMP_AMDGCN__
__DEVICE__ __CONSTEXPR__ float fabs(float __x) { return ::fabsf(__x); }
__DEVICE__ __CONSTEXPR__ float sin(float __x) { return ::sinf(__x); }
__DEVICE__ __CONSTEXPR__ float cos(float __x) { return ::cosf(__x); }
#endif
__DEVICE__ __CONSTEXPR__ double abs(double __x) { return ::fabs(__x); }
__DEVICE__ __CONSTEXPR__ float abs(float __x) { return ::fabsf(__x); }
__DEVICE__ __CONSTEXPR__ long long abs(long long __n) { return ::llabs(__n); }
__DEVICE__ __CONSTEXPR__ long abs(long __n) { return ::labs(__n); }
__DEVICE__ __CONSTEXPR__ float fma(float __x, float __y, float __z) {
  return ::fmaf(__x, __y, __z);
}
#if !defined(__HIPCC_RTC__)
// The value returned by fpclassify is platform-dependent, so it is not
// supported by hipRTC.
__DEVICE__ __CONSTEXPR__ int fpclassify(float __x) {
  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
                              FP_ZERO, __x);
}
__DEVICE__ __CONSTEXPR__ int fpclassify(double __x) {
  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
                              FP_ZERO, __x);
}
#endif // !defined(__HIPCC_RTC__)

__DEVICE__ __CONSTEXPR__ float frexp(float __arg, int *__exp) {
  return ::frexpf(__arg, __exp);
}

#if defined(__OPENMP_AMDGCN__)
// For OpenMP we work around some old system headers that have non-conforming
// `isinf(float)` and `isnan(float)` implementations that return an `int`. We do
// this by providing two versions of these functions, differing only in the
// return type. To avoid conflicting definitions we disable implicit base
// function generation. That means we will end up with two specializations, one
// per type, but only one has a base function defined by the system header.
#pragma omp begin declare variant match(                                       \
    implementation = {extension(disable_implicit_base)})

// FIXME: We lack an extension to customize the mangling of the variants, e.g.,
// add a suffix. This means we would clash with the names of the variants
// (note that we do not create implicit base functions here). To avoid
// this clash we add a new trait to some of them that is always true
// (this is LLVM after all ;)). It will only influence the mangled name
// of the variants inside the inner region and avoid the clash.
#pragma omp begin declare variant match(implementation = {vendor(llvm)})

__DEVICE__ __CONSTEXPR__ int isinf(float __x) { return ::__isinff(__x); }
__DEVICE__ __CONSTEXPR__ int isinf(double __x) { return ::__isinf(__x); }
__DEVICE__ __CONSTEXPR__ int isfinite(float __x) { return ::__finitef(__x); }
__DEVICE__ __CONSTEXPR__ int isfinite(double __x) { return ::__finite(__x); }
__DEVICE__ __CONSTEXPR__ int isnan(float __x) { return ::__isnanf(__x); }
__DEVICE__ __CONSTEXPR__ int isnan(double __x) { return ::__isnan(__x); }

#pragma omp end declare variant
#endif // defined(__OPENMP_AMDGCN__)

__DEVICE__ __CONSTEXPR__ bool isinf(float __x) { return ::__isinff(__x); }
__DEVICE__ __CONSTEXPR__ bool isinf(double __x) { return ::__isinf(__x); }
__DEVICE__ __CONSTEXPR__ bool isfinite(float __x) { return ::__finitef(__x); }
__DEVICE__ __CONSTEXPR__ bool isfinite(double __x) { return ::__finite(__x); }
__DEVICE__ __CONSTEXPR__ bool isnan(float __x) { return ::__isnanf(__x); }
__DEVICE__ __CONSTEXPR__ bool isnan(double __x) { return ::__isnan(__x); }

#if defined(__OPENMP_AMDGCN__)
#pragma omp end declare variant
#endif // defined(__OPENMP_AMDGCN__)

__DEVICE__ __CONSTEXPR__ bool isgreater(float __x, float __y) {
  return __builtin_isgreater(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool isgreater(double __x, double __y) {
  return __builtin_isgreater(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool isgreaterequal(float __x, float __y) {
  return __builtin_isgreaterequal(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool isgreaterequal(double __x, double __y) {
  return __builtin_isgreaterequal(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool isless(float __x, float __y) {
  return __builtin_isless(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool isless(double __x, double __y) {
  return __builtin_isless(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool islessequal(float __x, float __y) {
  return __builtin_islessequal(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool islessequal(double __x, double __y) {
  return __builtin_islessequal(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool islessgreater(float __x, float __y) {
  return __builtin_islessgreater(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool islessgreater(double __x, double __y) {
  return __builtin_islessgreater(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool isnormal(float __x) {
  return __builtin_isnormal(__x);
}
__DEVICE__ __CONSTEXPR__ bool isnormal(double __x) {
  return __builtin_isnormal(__x);
}
__DEVICE__ __CONSTEXPR__ bool isunordered(float __x, float __y) {
  return __builtin_isunordered(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool isunordered(double __x, double __y) {
  return __builtin_isunordered(__x, __y);
}
__DEVICE__ __CONSTEXPR__ float modf(float __x, float *__iptr) {
  return ::modff(__x, __iptr);
}
__DEVICE__ __CONSTEXPR__ float pow(float __base, int __iexp) {
  return ::powif(__base, __iexp);
}
__DEVICE__ __CONSTEXPR__ double pow(double __base, int __iexp) {
  return ::powi(__base, __iexp);
}
__DEVICE__ __CONSTEXPR__ float remquo(float __x, float __y, int *__quo) {
  return ::remquof(__x, __y, __quo);
}
__DEVICE__ __CONSTEXPR__ float scalbln(float __x, long int __n) {
  return ::scalblnf(__x, __n);
}
__DEVICE__ __CONSTEXPR__ bool signbit(float __x) { return ::__signbitf(__x); }
__DEVICE__ __CONSTEXPR__ bool signbit(double __x) { return ::__signbit(__x); }

// Notably missing above is nexttoward. We omit it because
// ocml doesn't provide an implementation, and we don't want to be in the
// business of implementing tricky libm functions in this header.

// Other functions.
__DEVICE__ __CONSTEXPR__ _Float16 fma(_Float16 __x, _Float16 __y,
                                      _Float16 __z) {
  return __builtin_fmaf16(__x, __y, __z);
}
__DEVICE__ __CONSTEXPR__ _Float16 pow(_Float16 __base, int __iexp) {
  return __ocml_pown_f16(__base, __iexp);
}

#ifndef __OPENMP_AMDGCN__
// BEGIN DEF_FUN and HIP_OVERLOAD

// BEGIN DEF_FUN

#pragma push_macro("__DEF_FUN1")
#pragma push_macro("__DEF_FUN2")
#pragma push_macro("__DEF_FUN2_FI")

// Define cmath functions that take a float argument and return __retty.
#define __DEF_FUN1(__retty, __func)                                            \
  __DEVICE__ __CONSTEXPR__ __retty __func(float __x) { return __func##f(__x); }

// Define cmath functions that take two float arguments and return __retty.
#define __DEF_FUN2(__retty, __func)                                            \
  __DEVICE__ __CONSTEXPR__ __retty __func(float __x, float __y) {              \
    return __func##f(__x, __y);                                                \
  }

// Define cmath functions that take a float and an int argument and return
// __retty.
#define __DEF_FUN2_FI(__retty, __func)                                         \
  __DEVICE__ __CONSTEXPR__ __retty __func(float __x, int __y) {                \
    return __func##f(__x, __y);                                                \
  }

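// [Editor's note] What a __DEF_FUN1 expansion looks like: token pasting
// (##) builds the name of the f-suffixed C function, so __DEF_FUN1(float,
// acos) produces a float overload that simply forwards:
//
//   __DEVICE__ __CONSTEXPR__ float acos(float __x) { return acosf(__x); }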
__DEF_FUN1(float, acos)
__DEF_FUN1(float, acosh)
__DEF_FUN1(float, asin)
__DEF_FUN1(float, asinh)
__DEF_FUN1(float, atan)
__DEF_FUN2(float, atan2)
__DEF_FUN1(float, atanh)
__DEF_FUN1(float, cbrt)
__DEF_FUN1(float, ceil)
__DEF_FUN2(float, copysign)
__DEF_FUN1(float, cos)
__DEF_FUN1(float, cosh)
__DEF_FUN1(float, erf)
__DEF_FUN1(float, erfc)
__DEF_FUN1(float, exp)
__DEF_FUN1(float, exp2)
__DEF_FUN1(float, expm1)
__DEF_FUN1(float, fabs)
__DEF_FUN2(float, fdim)
__DEF_FUN1(float, floor)
__DEF_FUN2(float, fmax)
__DEF_FUN2(float, fmin)
__DEF_FUN2(float, fmod)
__DEF_FUN2(float, hypot)
__DEF_FUN1(int, ilogb)
__DEF_FUN2_FI(float, ldexp)
__DEF_FUN1(float, lgamma)
__DEF_FUN1(float, log)
__DEF_FUN1(float, log10)
__DEF_FUN1(float, log1p)
__DEF_FUN1(float, log2)
__DEF_FUN1(float, logb)
__DEF_FUN1(long long, llrint)
__DEF_FUN1(long long, llround)
__DEF_FUN1(long, lrint)
__DEF_FUN1(long, lround)
__DEF_FUN1(float, nearbyint)
__DEF_FUN2(float, nextafter)
__DEF_FUN2(float, pow)
__DEF_FUN2(float, remainder)
__DEF_FUN1(float, rint)
__DEF_FUN1(float, round)
__DEF_FUN2_FI(float, scalbn)
__DEF_FUN1(float, sin)
__DEF_FUN1(float, sinh)
__DEF_FUN1(float, sqrt)
__DEF_FUN1(float, tan)
__DEF_FUN1(float, tanh)
__DEF_FUN1(float, tgamma)
__DEF_FUN1(float, trunc)

#pragma pop_macro("__DEF_FUN1")
#pragma pop_macro("__DEF_FUN2")
#pragma pop_macro("__DEF_FUN2_FI")

// END DEF_FUN

// BEGIN HIP_OVERLOAD

#pragma push_macro("__HIP_OVERLOAD1")
#pragma push_macro("__HIP_OVERLOAD2")

// __hip_enable_if::type is a type function which returns __T if __B is true.
template <bool __B, class __T = void> struct __hip_enable_if {};

template <class __T> struct __hip_enable_if<true, __T> { typedef __T type; };

namespace __hip {
template <class _Tp> struct is_integral {
  enum { value = 0 };
};
template <> struct is_integral<bool> {
  enum { value = 1 };
};
template <> struct is_integral<char> {
  enum { value = 1 };
};
template <> struct is_integral<signed char> {
  enum { value = 1 };
};
template <> struct is_integral<unsigned char> {
  enum { value = 1 };
};
template <> struct is_integral<wchar_t> {
  enum { value = 1 };
};
template <> struct is_integral<short> {
  enum { value = 1 };
};
template <> struct is_integral<unsigned short> {
  enum { value = 1 };
};
template <> struct is_integral<int> {
  enum { value = 1 };
};
template <> struct is_integral<unsigned int> {
  enum { value = 1 };
};
template <> struct is_integral<long> {
  enum { value = 1 };
};
template <> struct is_integral<unsigned long> {
  enum { value = 1 };
};
template <> struct is_integral<long long> {
  enum { value = 1 };
};
template <> struct is_integral<unsigned long long> {
  enum { value = 1 };
};

// TODO: specialize is_arithmetic<_Float16>.
template <class _Tp> struct is_arithmetic {
  enum { value = 0 };
};
template <> struct is_arithmetic<bool> {
  enum { value = 1 };
};
template <> struct is_arithmetic<char> {
  enum { value = 1 };
};
template <> struct is_arithmetic<signed char> {
  enum { value = 1 };
};
template <> struct is_arithmetic<unsigned char> {
  enum { value = 1 };
};
template <> struct is_arithmetic<wchar_t> {
  enum { value = 1 };
};
template <> struct is_arithmetic<short> {
  enum { value = 1 };
};
template <> struct is_arithmetic<unsigned short> {
  enum { value = 1 };
};
template <> struct is_arithmetic<int> {
  enum { value = 1 };
};
template <> struct is_arithmetic<unsigned int> {
  enum { value = 1 };
};
template <> struct is_arithmetic<long> {
  enum { value = 1 };
};
template <> struct is_arithmetic<unsigned long> {
  enum { value = 1 };
};
template <> struct is_arithmetic<long long> {
  enum { value = 1 };
};
template <> struct is_arithmetic<unsigned long long> {
  enum { value = 1 };
};
template <> struct is_arithmetic<float> {
  enum { value = 1 };
};
template <> struct is_arithmetic<double> {
  enum { value = 1 };
};

struct true_type {
  static const __constant__ bool value = true;
};
struct false_type {
  static const __constant__ bool value = false;
};

template <typename __T, typename __U> struct is_same : public false_type {};
template <typename __T> struct is_same<__T, __T> : public true_type {};

template <typename __T> struct add_rvalue_reference { typedef __T &&type; };

template <typename __T> typename add_rvalue_reference<__T>::type declval();

// decltype is only available in C++11 and above.
#if __cplusplus >= 201103L
// __hip_promote
template <class _Tp> struct __numeric_type {
  static void __test(...);
  static _Float16 __test(_Float16);
  static float __test(float);
  static double __test(char);
  static double __test(int);
  static double __test(unsigned);
  static double __test(long);
  static double __test(unsigned long);
  static double __test(long long);
  static double __test(unsigned long long);
  static double __test(double);
  // No support for long double, use double instead.
  static double __test(long double);

  template <typename _U>
  static auto __test_impl(int) -> decltype(__test(declval<_U>()));

  template <typename _U> static void __test_impl(...);

  typedef decltype(__test_impl<_Tp>(0)) type;
  static const bool value = !is_same<type, void>::value;
};

template <> struct __numeric_type<void> { static const bool value = true; };

template <class _A1, class _A2 = void, class _A3 = void,
          bool = __numeric_type<_A1>::value &&__numeric_type<_A2>::value
                     &&__numeric_type<_A3>::value>
class __promote_imp {
public:
  static const bool value = false;
};

template <class _A1, class _A2, class _A3>
class __promote_imp<_A1, _A2, _A3, true> {
private:
  typedef typename __promote_imp<_A1>::type __type1;
  typedef typename __promote_imp<_A2>::type __type2;
  typedef typename __promote_imp<_A3>::type __type3;

public:
  typedef decltype(__type1() + __type2() + __type3()) type;
  static const bool value = true;
};

template <class _A1, class _A2> class __promote_imp<_A1, _A2, void, true> {
private:
  typedef typename __promote_imp<_A1>::type __type1;
  typedef typename __promote_imp<_A2>::type __type2;

public:
  typedef decltype(__type1() + __type2()) type;
  static const bool value = true;
};

template <class _A1> class __promote_imp<_A1, void, void, true> {
public:
  typedef typename __numeric_type<_A1>::type type;
  static const bool value = true;
};

template <class _A1, class _A2 = void, class _A3 = void>
class __promote : public __promote_imp<_A1, _A2, _A3> {};
#endif //__cplusplus >= 201103L
} // namespace __hip

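#if __cplusplus >= 201103L
// [Editor's sketch -- illustrative only, not part of the original header.]
// __promote computes the common type by adding value-initialized operands,
// after __numeric_type has mapped every integral type to double. Unlike the
// usual arithmetic conversions, a mixed int/float call therefore promotes
// to double:
static_assert(
    __hip::is_same<__hip::__promote<float, double>::type, double>::value,
    "float + double promotes to double");
static_assert(
    __hip::is_same<__hip::__promote<int, float>::type, double>::value,
    "integral operands are mapped to double before promotion");
#endif // [Editor's sketch]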
// __HIP_OVERLOAD1 is used to resolve function calls with an integer argument
// to avoid a compilation error due to ambiguity, e.g. floor(5) is resolved to
// floor(double).
#define __HIP_OVERLOAD1(__retty, __fn)                                         \
  template <typename __T>                                                      \
  __DEVICE__ __CONSTEXPR__                                                     \
      typename __hip_enable_if<__hip::is_integral<__T>::value, __retty>::type  \
      __fn(__T __x) {                                                          \
    return ::__fn((double)__x);                                                \
  }

// __HIP_OVERLOAD2 is used to resolve function calls with mixed float/double
// or integer arguments to avoid a compilation error due to ambiguity, e.g.
// max(5.0f, 6.0) is resolved to max(double, double).
#if __cplusplus >= 201103L
#define __HIP_OVERLOAD2(__retty, __fn)                                         \
  template <typename __T1, typename __T2>                                      \
  __DEVICE__ __CONSTEXPR__                                                     \
      typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&            \
                                   __hip::is_arithmetic<__T2>::value,          \
                               __retty>::type                                  \
      __fn(__T1 __x, __T2 __y) {                                               \
    typedef typename __hip::__promote<__T1, __T2>::type __arg_type;            \
    return __fn((__arg_type)__x, (__arg_type)__y);                             \
  }
#else
#define __HIP_OVERLOAD2(__retty, __fn)                                         \
  template <typename __T1, typename __T2>                                      \
  __DEVICE__ __CONSTEXPR__                                                     \
      typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&            \
                                   __hip::is_arithmetic<__T2>::value,          \
                               __retty>::type                                  \
      __fn(__T1 __x, __T2 __y) {                                               \
    return __fn((double)__x, (double)__y);                                     \
  }
#endif

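// [Editor's note] What __HIP_OVERLOAD1(double, floor) buys: without it,
// floor(5) is ambiguous between floor(float) and floor(double). The
// generated template accepts any integral argument (SFINAE via
// __hip_enable_if) and forwards to the double version, so:
//
//   floor(5);    // -> template -> ::floor((double)5)
//   floor(5.0f); // -> floor(float); the template drops out (not integral)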
__HIP_OVERLOAD1(double, acos)
__HIP_OVERLOAD1(double, acosh)
__HIP_OVERLOAD1(double, asin)
__HIP_OVERLOAD1(double, asinh)
__HIP_OVERLOAD1(double, atan)
__HIP_OVERLOAD2(double, atan2)
__HIP_OVERLOAD1(double, atanh)
__HIP_OVERLOAD1(double, cbrt)
__HIP_OVERLOAD1(double, ceil)
__HIP_OVERLOAD2(double, copysign)
__HIP_OVERLOAD1(double, cos)
__HIP_OVERLOAD1(double, cosh)
__HIP_OVERLOAD1(double, erf)
__HIP_OVERLOAD1(double, erfc)
__HIP_OVERLOAD1(double, exp)
__HIP_OVERLOAD1(double, exp2)
__HIP_OVERLOAD1(double, expm1)
__HIP_OVERLOAD1(double, fabs)
__HIP_OVERLOAD2(double, fdim)
__HIP_OVERLOAD1(double, floor)
__HIP_OVERLOAD2(double, fmax)
__HIP_OVERLOAD2(double, fmin)
__HIP_OVERLOAD2(double, fmod)
#if !defined(__HIPCC_RTC__)
__HIP_OVERLOAD1(int, fpclassify)
#endif // !defined(__HIPCC_RTC__)
__HIP_OVERLOAD2(double, hypot)
__HIP_OVERLOAD1(int, ilogb)
__HIP_OVERLOAD1(bool, isfinite)
__HIP_OVERLOAD2(bool, isgreater)
__HIP_OVERLOAD2(bool, isgreaterequal)
__HIP_OVERLOAD1(bool, isinf)
__HIP_OVERLOAD2(bool, isless)
__HIP_OVERLOAD2(bool, islessequal)
__HIP_OVERLOAD2(bool, islessgreater)
__HIP_OVERLOAD1(bool, isnan)
__HIP_OVERLOAD1(bool, isnormal)
__HIP_OVERLOAD2(bool, isunordered)
__HIP_OVERLOAD1(double, lgamma)
__HIP_OVERLOAD1(double, log)
__HIP_OVERLOAD1(double, log10)
__HIP_OVERLOAD1(double, log1p)
__HIP_OVERLOAD1(double, log2)
__HIP_OVERLOAD1(double, logb)
__HIP_OVERLOAD1(long long, llrint)
__HIP_OVERLOAD1(long long, llround)
__HIP_OVERLOAD1(long, lrint)
__HIP_OVERLOAD1(long, lround)
__HIP_OVERLOAD1(double, nearbyint)
__HIP_OVERLOAD2(double, nextafter)
__HIP_OVERLOAD2(double, pow)
__HIP_OVERLOAD2(double, remainder)
__HIP_OVERLOAD1(double, rint)
__HIP_OVERLOAD1(double, round)
__HIP_OVERLOAD1(bool, signbit)
__HIP_OVERLOAD1(double, sin)
__HIP_OVERLOAD1(double, sinh)
__HIP_OVERLOAD1(double, sqrt)
__HIP_OVERLOAD1(double, tan)
__HIP_OVERLOAD1(double, tanh)
__HIP_OVERLOAD1(double, tgamma)
__HIP_OVERLOAD1(double, trunc)

// Overload these but don't add them to std, they are not part of cmath.
__HIP_OVERLOAD2(double, max)
__HIP_OVERLOAD2(double, min)

// Additional overloads that don't quite match HIP_OVERLOAD.
#if __cplusplus >= 201103L
template <typename __T1, typename __T2, typename __T3>
__DEVICE__ __CONSTEXPR__ typename __hip_enable_if<
    __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value &&
        __hip::is_arithmetic<__T3>::value,
    typename __hip::__promote<__T1, __T2, __T3>::type>::type
fma(__T1 __x, __T2 __y, __T3 __z) {
  typedef typename __hip::__promote<__T1, __T2, __T3>::type __result_type;
  return ::fma((__result_type)__x, (__result_type)__y, (__result_type)__z);
}
#else
template <typename __T1, typename __T2, typename __T3>
__DEVICE__ __CONSTEXPR__
    typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
                                 __hip::is_arithmetic<__T2>::value &&
                                 __hip::is_arithmetic<__T3>::value,
                             double>::type
    fma(__T1 __x, __T2 __y, __T3 __z) {
  return ::fma((double)__x, (double)__y, (double)__z);
}
#endif

template <typename __T>
__DEVICE__ __CONSTEXPR__
    typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
    frexp(__T __x, int *__exp) {
  return ::frexp((double)__x, __exp);
}

template <typename __T>
__DEVICE__ __CONSTEXPR__
    typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
    ldexp(__T __x, int __exp) {
  return ::ldexp((double)__x, __exp);
}

template <typename __T>
__DEVICE__ __CONSTEXPR__
    typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
    modf(__T __x, double *__exp) {
  return ::modf((double)__x, __exp);
}

#if __cplusplus >= 201103L
template <typename __T1, typename __T2>
__DEVICE__ __CONSTEXPR__
    typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
                                 __hip::is_arithmetic<__T2>::value,
                             typename __hip::__promote<__T1, __T2>::type>::type
    remquo(__T1 __x, __T2 __y, int *__quo) {
  typedef typename __hip::__promote<__T1, __T2>::type __result_type;
  return ::remquo((__result_type)__x, (__result_type)__y, __quo);
}
#else
template <typename __T1, typename __T2>
__DEVICE__ __CONSTEXPR__
    typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
                                 __hip::is_arithmetic<__T2>::value,
                             double>::type
    remquo(__T1 __x, __T2 __y, int *__quo) {
  return ::remquo((double)__x, (double)__y, __quo);
}
#endif

template <typename __T>
__DEVICE__ __CONSTEXPR__
    typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
    scalbln(__T __x, long int __exp) {
  return ::scalbln((double)__x, __exp);
}

template <typename __T>
__DEVICE__ __CONSTEXPR__
    typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
    scalbn(__T __x, int __exp) {
  return ::scalbn((double)__x, __exp);
}

#pragma pop_macro("__HIP_OVERLOAD1")
#pragma pop_macro("__HIP_OVERLOAD2")

// END HIP_OVERLOAD

// END DEF_FUN and HIP_OVERLOAD

#endif // ifndef __OPENMP_AMDGCN__
#endif // defined(__cplusplus)

#ifndef __OPENMP_AMDGCN__
// Define these overloads inside the namespace our standard library uses.
#if !defined(__HIPCC_RTC__)
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_BEGIN_NAMESPACE_STD
#else
namespace std {
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif // _GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif // _LIBCPP_BEGIN_NAMESPACE_STD

// Pull the new overloads we defined above into namespace std.
// using ::abs; - This may be considered for C++.
using ::acos;
using ::acosh;
using ::asin;
using ::asinh;
using ::atan;
using ::atan2;
using ::atanh;
using ::cbrt;
using ::ceil;
using ::copysign;
using ::cos;
using ::cosh;
using ::erf;
using ::erfc;
using ::exp;
using ::exp2;
using ::expm1;
using ::fabs;
using ::fdim;
using ::floor;
using ::fma;
using ::fmax;
using ::fmin;
using ::fmod;
using ::fpclassify;
using ::frexp;
using ::hypot;
using ::ilogb;
using ::isfinite;
using ::isgreater;
using ::isgreaterequal;
using ::isless;
using ::islessequal;
using ::islessgreater;
using ::isnormal;
using ::isunordered;
using ::ldexp;
using ::lgamma;
using ::llrint;
using ::llround;
using ::log;
using ::log10;
using ::log1p;
using ::log2;
using ::logb;
using ::lrint;
using ::lround;
using ::modf;
// using ::nan; - This may be considered for C++.
// using ::nanf; - This may be considered for C++.
// using ::nanl; - This is not yet defined.
using ::nearbyint;
using ::nextafter;
// using ::nexttoward; - Omit this since we do not have a definition.
using ::pow;
using ::remainder;
using ::remquo;
using ::rint;
using ::round;
using ::scalbln;
using ::scalbn;
using ::signbit;
using ::sin;
using ::sinh;
using ::sqrt;
using ::tan;
using ::tanh;
using ::tgamma;
using ::trunc;

// Well this is fun: we need to pull these symbols in for libc++, but we can't
// pull them in with libstdc++, because its ::isinf and ::isnan are the
// int-returning C functions, which differ from its bool-returning std::isinf
// and std::isnan.
#ifndef __GLIBCXX__
using ::isinf;
using ::isnan;
#endif

// Finally, pull the "foobarf" functions that HIP defines into std.
using ::acosf;
using ::acoshf;
using ::asinf;
using ::asinhf;
using ::atan2f;
using ::atanf;
using ::atanhf;
using ::cbrtf;
using ::ceilf;
using ::copysignf;
using ::cosf;
using ::coshf;
using ::erfcf;
using ::erff;
using ::exp2f;
using ::expf;
using ::expm1f;
using ::fabsf;
using ::fdimf;
using ::floorf;
using ::fmaf;
using ::fmaxf;
using ::fminf;
using ::fmodf;
using ::frexpf;
using ::hypotf;
using ::ilogbf;
using ::ldexpf;
using ::lgammaf;
using ::llrintf;
using ::llroundf;
using ::log10f;
using ::log1pf;
using ::log2f;
using ::logbf;
using ::logf;
using ::lrintf;
using ::lroundf;
using ::modff;
using ::nearbyintf;
using ::nextafterf;
// using ::nexttowardf; - Omit this since we do not have a definition.
using ::powf;
using ::remainderf;
using ::remquof;
using ::rintf;
using ::roundf;
using ::scalblnf;
using ::scalbnf;
using ::sinf;
using ::sinhf;
using ::sqrtf;
using ::tanf;
using ::tanhf;
using ::tgammaf;
using ::truncf;

#ifdef _LIBCPP_END_NAMESPACE_STD
_LIBCPP_END_NAMESPACE_STD
#else
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_END_NAMESPACE_VERSION
#endif // _GLIBCXX_BEGIN_NAMESPACE_VERSION
} // namespace std
#endif // _LIBCPP_END_NAMESPACE_STD
#endif // !defined(__HIPCC_RTC__)

// Define device-side math functions from <ymath.h> on MSVC.
#if !defined(__HIPCC_RTC__)
#if defined(_MSC_VER)

// Before VS2019, `<ymath.h>` is also included in `<limits>` and other headers.
// But, from VS2019, it's only included in `<complex>`. Need to include
// `<ymath.h>` here to ensure the C functions declared there won't be marked as
// `__host__` and `__device__` through the `<complex>` wrapper.
#include <ymath.h>

#if defined(__cplusplus)
extern "C" {
#endif // defined(__cplusplus)
__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) double _Cosh(double x,
                                                                    double y) {
  return cosh(x) * y;
}
__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) float _FCosh(float x,
                                                                    float y) {
  return coshf(x) * y;
}
__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) short _Dtest(double *p) {
  return fpclassify(*p);
}
__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) short _FDtest(float *p) {
  return fpclassify(*p);
}
__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) double _Sinh(double x,
                                                                    double y) {
  return sinh(x) * y;
}
__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) float _FSinh(float x,
                                                                    float y) {
  return sinhf(x) * y;
}
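// Editor's sketch (assumption, not part of the original header): MSVC's
// <complex> calls these internal helpers, and the second parameter acts as a
// scale factor, i.e. _Cosh(x, y) computes cosh(x) * y so the library can form
// scaled results. A hypothetical device-side illustration:
__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) double
__ymath_cosh_demo(double x) {
  return _Cosh(x, 0.5); // == cosh(x) * 0.5
}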
#if defined(__cplusplus)
}
#endif // defined(__cplusplus)
#endif // defined(_MSC_VER)
#endif // !defined(__HIPCC_RTC__)
#endif // ifndef __OPENMP_AMDGCN__

#pragma pop_macro("__DEVICE__")
#pragma pop_macro("__CONSTEXPR__")

#endif // __CLANG_HIP_CMATH_H__
@@ -1,345 +0,0 @@
/*===---- __clang_hip_libdevice_declares.h - HIP device library decls -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __CLANG_HIP_LIBDEVICE_DECLARES_H__
#define __CLANG_HIP_LIBDEVICE_DECLARES_H__

#if !defined(__HIPCC_RTC__) && __has_include("hip/hip_version.h")
#include "hip/hip_version.h"
#endif // __has_include("hip/hip_version.h")

#define __PRIVATE_AS __attribute__((opencl_private))

#ifdef __cplusplus
extern "C" {
#endif

// BEGIN FLOAT
__device__ __attribute__((const)) float __ocml_acos_f32(float);
__device__ __attribute__((pure)) float __ocml_acosh_f32(float);
__device__ __attribute__((const)) float __ocml_asin_f32(float);
__device__ __attribute__((pure)) float __ocml_asinh_f32(float);
__device__ __attribute__((const)) float __ocml_atan2_f32(float, float);
__device__ __attribute__((const)) float __ocml_atan_f32(float);
__device__ __attribute__((pure)) float __ocml_atanh_f32(float);
__device__ __attribute__((pure)) float __ocml_cbrt_f32(float);
__device__ __attribute__((const)) float __ocml_ceil_f32(float);
__device__ __attribute__((const)) float __ocml_copysign_f32(float, float);
__device__ float __ocml_cos_f32(float);
__device__ float __ocml_native_cos_f32(float);
__device__ __attribute__((pure)) float __ocml_cosh_f32(float);
__device__ float __ocml_cospi_f32(float);
__device__ float __ocml_i0_f32(float);
__device__ float __ocml_i1_f32(float);
__device__ __attribute__((pure)) float __ocml_erfc_f32(float);
__device__ __attribute__((pure)) float __ocml_erfcinv_f32(float);
__device__ __attribute__((pure)) float __ocml_erfcx_f32(float);
__device__ __attribute__((pure)) float __ocml_erf_f32(float);
__device__ __attribute__((pure)) float __ocml_erfinv_f32(float);
__device__ __attribute__((pure)) float __ocml_exp10_f32(float);
__device__ __attribute__((pure)) float __ocml_native_exp10_f32(float);
__device__ __attribute__((pure)) float __ocml_exp2_f32(float);
__device__ __attribute__((pure)) float __ocml_exp_f32(float);
__device__ __attribute__((pure)) float __ocml_native_exp_f32(float);
__device__ __attribute__((pure)) float __ocml_expm1_f32(float);
__device__ __attribute__((const)) float __ocml_fabs_f32(float);
__device__ __attribute__((const)) float __ocml_fdim_f32(float, float);
__device__ __attribute__((const)) float __ocml_floor_f32(float);
__device__ __attribute__((const)) float __ocml_fma_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_fmax_f32(float, float);
__device__ __attribute__((const)) float __ocml_fmin_f32(float, float);
__device__ __attribute__((const)) float __ocml_fmod_f32(float, float);
__device__ float __ocml_frexp_f32(float, __PRIVATE_AS int *);
__device__ __attribute__((const)) float __ocml_hypot_f32(float, float);
__device__ __attribute__((const)) int __ocml_ilogb_f32(float);
__device__ __attribute__((const)) int __ocml_isfinite_f32(float);
__device__ __attribute__((const)) int __ocml_isinf_f32(float);
__device__ __attribute__((const)) int __ocml_isnan_f32(float);
__device__ float __ocml_j0_f32(float);
__device__ float __ocml_j1_f32(float);
__device__ __attribute__((const)) float __ocml_ldexp_f32(float, int);
__device__ float __ocml_lgamma_f32(float);
__device__ __attribute__((pure)) float __ocml_log10_f32(float);
__device__ __attribute__((pure)) float __ocml_native_log10_f32(float);
__device__ __attribute__((pure)) float __ocml_log1p_f32(float);
__device__ __attribute__((pure)) float __ocml_log2_f32(float);
__device__ __attribute__((pure)) float __ocml_native_log2_f32(float);
__device__ __attribute__((const)) float __ocml_logb_f32(float);
__device__ __attribute__((pure)) float __ocml_log_f32(float);
__device__ __attribute__((pure)) float __ocml_native_log_f32(float);
__device__ float __ocml_modf_f32(float, __PRIVATE_AS float *);
__device__ __attribute__((const)) float __ocml_nearbyint_f32(float);
__device__ __attribute__((const)) float __ocml_nextafter_f32(float, float);
__device__ __attribute__((const)) float __ocml_len3_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_len4_f32(float, float, float,
                                                        float);
__device__ __attribute__((pure)) float __ocml_ncdf_f32(float);
__device__ __attribute__((pure)) float __ocml_ncdfinv_f32(float);
__device__ __attribute__((pure)) float __ocml_pow_f32(float, float);
__device__ __attribute__((pure)) float __ocml_pown_f32(float, int);
__device__ __attribute__((pure)) float __ocml_rcbrt_f32(float);
__device__ __attribute__((const)) float __ocml_remainder_f32(float, float);
__device__ float __ocml_remquo_f32(float, float, __PRIVATE_AS int *);
__device__ __attribute__((const)) float __ocml_rhypot_f32(float, float);
__device__ __attribute__((const)) float __ocml_rint_f32(float);
__device__ __attribute__((const)) float __ocml_rlen3_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_rlen4_f32(float, float, float,
                                                         float);
__device__ __attribute__((const)) float __ocml_round_f32(float);
__device__ __attribute__((pure)) float __ocml_rsqrt_f32(float);
__device__ __attribute__((const)) float __ocml_scalb_f32(float, float);
__device__ __attribute__((const)) float __ocml_scalbn_f32(float, int);
__device__ __attribute__((const)) int __ocml_signbit_f32(float);
__device__ float __ocml_sincos_f32(float, __PRIVATE_AS float *);
__device__ float __ocml_sincospi_f32(float, __PRIVATE_AS float *);
__device__ float __ocml_sin_f32(float);
__device__ float __ocml_native_sin_f32(float);
__device__ __attribute__((pure)) float __ocml_sinh_f32(float);
__device__ float __ocml_sinpi_f32(float);
__device__ __attribute__((const)) float __ocml_sqrt_f32(float);
__device__ __attribute__((const)) float __ocml_native_sqrt_f32(float);
__device__ float __ocml_tan_f32(float);
__device__ __attribute__((pure)) float __ocml_tanh_f32(float);
__device__ float __ocml_tgamma_f32(float);
__device__ __attribute__((const)) float __ocml_trunc_f32(float);
__device__ float __ocml_y0_f32(float);
__device__ float __ocml_y1_f32(float);

// BEGIN INTRINSICS
__device__ __attribute__((const)) float __ocml_add_rte_f32(float, float);
__device__ __attribute__((const)) float __ocml_add_rtn_f32(float, float);
__device__ __attribute__((const)) float __ocml_add_rtp_f32(float, float);
__device__ __attribute__((const)) float __ocml_add_rtz_f32(float, float);
__device__ __attribute__((const)) float __ocml_sub_rte_f32(float, float);
__device__ __attribute__((const)) float __ocml_sub_rtn_f32(float, float);
__device__ __attribute__((const)) float __ocml_sub_rtp_f32(float, float);
__device__ __attribute__((const)) float __ocml_sub_rtz_f32(float, float);
__device__ __attribute__((const)) float __ocml_mul_rte_f32(float, float);
__device__ __attribute__((const)) float __ocml_mul_rtn_f32(float, float);
__device__ __attribute__((const)) float __ocml_mul_rtp_f32(float, float);
__device__ __attribute__((const)) float __ocml_mul_rtz_f32(float, float);
__device__ __attribute__((const)) float __ocml_div_rte_f32(float, float);
__device__ __attribute__((const)) float __ocml_div_rtn_f32(float, float);
__device__ __attribute__((const)) float __ocml_div_rtp_f32(float, float);
__device__ __attribute__((const)) float __ocml_div_rtz_f32(float, float);
__device__ __attribute__((const)) float __ocml_sqrt_rte_f32(float);
__device__ __attribute__((const)) float __ocml_sqrt_rtn_f32(float);
__device__ __attribute__((const)) float __ocml_sqrt_rtp_f32(float);
__device__ __attribute__((const)) float __ocml_sqrt_rtz_f32(float);
__device__ __attribute__((const)) float __ocml_fma_rte_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float);
// END INTRINSICS
// END FLOAT

// BEGIN DOUBLE
__device__ __attribute__((const)) double __ocml_acos_f64(double);
__device__ __attribute__((pure)) double __ocml_acosh_f64(double);
__device__ __attribute__((const)) double __ocml_asin_f64(double);
__device__ __attribute__((pure)) double __ocml_asinh_f64(double);
__device__ __attribute__((const)) double __ocml_atan2_f64(double, double);
__device__ __attribute__((const)) double __ocml_atan_f64(double);
__device__ __attribute__((pure)) double __ocml_atanh_f64(double);
__device__ __attribute__((pure)) double __ocml_cbrt_f64(double);
__device__ __attribute__((const)) double __ocml_ceil_f64(double);
__device__ __attribute__((const)) double __ocml_copysign_f64(double, double);
__device__ double __ocml_cos_f64(double);
__device__ __attribute__((pure)) double __ocml_cosh_f64(double);
__device__ double __ocml_cospi_f64(double);
__device__ double __ocml_i0_f64(double);
__device__ double __ocml_i1_f64(double);
__device__ __attribute__((pure)) double __ocml_erfc_f64(double);
__device__ __attribute__((pure)) double __ocml_erfcinv_f64(double);
__device__ __attribute__((pure)) double __ocml_erfcx_f64(double);
__device__ __attribute__((pure)) double __ocml_erf_f64(double);
__device__ __attribute__((pure)) double __ocml_erfinv_f64(double);
__device__ __attribute__((pure)) double __ocml_exp10_f64(double);
__device__ __attribute__((pure)) double __ocml_exp2_f64(double);
__device__ __attribute__((pure)) double __ocml_exp_f64(double);
__device__ __attribute__((pure)) double __ocml_expm1_f64(double);
__device__ __attribute__((const)) double __ocml_fabs_f64(double);
__device__ __attribute__((const)) double __ocml_fdim_f64(double, double);
__device__ __attribute__((const)) double __ocml_floor_f64(double);
__device__ __attribute__((const)) double __ocml_fma_f64(double, double, double);
__device__ __attribute__((const)) double __ocml_fmax_f64(double, double);
__device__ __attribute__((const)) double __ocml_fmin_f64(double, double);
__device__ __attribute__((const)) double __ocml_fmod_f64(double, double);
__device__ double __ocml_frexp_f64(double, __PRIVATE_AS int *);
__device__ __attribute__((const)) double __ocml_hypot_f64(double, double);
__device__ __attribute__((const)) int __ocml_ilogb_f64(double);
__device__ __attribute__((const)) int __ocml_isfinite_f64(double);
__device__ __attribute__((const)) int __ocml_isinf_f64(double);
__device__ __attribute__((const)) int __ocml_isnan_f64(double);
__device__ double __ocml_j0_f64(double);
__device__ double __ocml_j1_f64(double);
__device__ __attribute__((const)) double __ocml_ldexp_f64(double, int);
__device__ double __ocml_lgamma_f64(double);
__device__ __attribute__((pure)) double __ocml_log10_f64(double);
__device__ __attribute__((pure)) double __ocml_log1p_f64(double);
__device__ __attribute__((pure)) double __ocml_log2_f64(double);
__device__ __attribute__((const)) double __ocml_logb_f64(double);
__device__ __attribute__((pure)) double __ocml_log_f64(double);
__device__ double __ocml_modf_f64(double, __PRIVATE_AS double *);
__device__ __attribute__((const)) double __ocml_nearbyint_f64(double);
__device__ __attribute__((const)) double __ocml_nextafter_f64(double, double);
__device__ __attribute__((const)) double __ocml_len3_f64(double, double,
                                                         double);
__device__ __attribute__((const)) double __ocml_len4_f64(double, double, double,
                                                         double);
__device__ __attribute__((pure)) double __ocml_ncdf_f64(double);
__device__ __attribute__((pure)) double __ocml_ncdfinv_f64(double);
__device__ __attribute__((pure)) double __ocml_pow_f64(double, double);
__device__ __attribute__((pure)) double __ocml_pown_f64(double, int);
__device__ __attribute__((pure)) double __ocml_rcbrt_f64(double);
__device__ __attribute__((const)) double __ocml_remainder_f64(double, double);
__device__ double __ocml_remquo_f64(double, double, __PRIVATE_AS int *);
__device__ __attribute__((const)) double __ocml_rhypot_f64(double, double);
__device__ __attribute__((const)) double __ocml_rint_f64(double);
__device__ __attribute__((const)) double __ocml_rlen3_f64(double, double,
                                                          double);
__device__ __attribute__((const)) double __ocml_rlen4_f64(double, double,
                                                          double, double);
__device__ __attribute__((const)) double __ocml_round_f64(double);
__device__ __attribute__((pure)) double __ocml_rsqrt_f64(double);
__device__ __attribute__((const)) double __ocml_scalb_f64(double, double);
__device__ __attribute__((const)) double __ocml_scalbn_f64(double, int);
__device__ __attribute__((const)) int __ocml_signbit_f64(double);
__device__ double __ocml_sincos_f64(double, __PRIVATE_AS double *);
__device__ double __ocml_sincospi_f64(double, __PRIVATE_AS double *);
__device__ double __ocml_sin_f64(double);
__device__ __attribute__((pure)) double __ocml_sinh_f64(double);
__device__ double __ocml_sinpi_f64(double);
__device__ __attribute__((const)) double __ocml_sqrt_f64(double);
__device__ double __ocml_tan_f64(double);
__device__ __attribute__((pure)) double __ocml_tanh_f64(double);
__device__ double __ocml_tgamma_f64(double);
__device__ __attribute__((const)) double __ocml_trunc_f64(double);
__device__ double __ocml_y0_f64(double);
__device__ double __ocml_y1_f64(double);

// BEGIN INTRINSICS
__device__ __attribute__((const)) double __ocml_add_rte_f64(double, double);
__device__ __attribute__((const)) double __ocml_add_rtn_f64(double, double);
__device__ __attribute__((const)) double __ocml_add_rtp_f64(double, double);
__device__ __attribute__((const)) double __ocml_add_rtz_f64(double, double);
__device__ __attribute__((const)) double __ocml_sub_rte_f64(double, double);
__device__ __attribute__((const)) double __ocml_sub_rtn_f64(double, double);
__device__ __attribute__((const)) double __ocml_sub_rtp_f64(double, double);
__device__ __attribute__((const)) double __ocml_sub_rtz_f64(double, double);
__device__ __attribute__((const)) double __ocml_mul_rte_f64(double, double);
__device__ __attribute__((const)) double __ocml_mul_rtn_f64(double, double);
__device__ __attribute__((const)) double __ocml_mul_rtp_f64(double, double);
__device__ __attribute__((const)) double __ocml_mul_rtz_f64(double, double);
__device__ __attribute__((const)) double __ocml_div_rte_f64(double, double);
__device__ __attribute__((const)) double __ocml_div_rtn_f64(double, double);
__device__ __attribute__((const)) double __ocml_div_rtp_f64(double, double);
__device__ __attribute__((const)) double __ocml_div_rtz_f64(double, double);
__device__ __attribute__((const)) double __ocml_sqrt_rte_f64(double);
__device__ __attribute__((const)) double __ocml_sqrt_rtn_f64(double);
__device__ __attribute__((const)) double __ocml_sqrt_rtp_f64(double);
__device__ __attribute__((const)) double __ocml_sqrt_rtz_f64(double);
__device__ __attribute__((const)) double __ocml_fma_rte_f64(double, double,
                                                            double);
__device__ __attribute__((const)) double __ocml_fma_rtn_f64(double, double,
                                                            double);
__device__ __attribute__((const)) double __ocml_fma_rtp_f64(double, double,
                                                            double);
__device__ __attribute__((const)) double __ocml_fma_rtz_f64(double, double,
                                                            double);

__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
__device__ _Float16 __ocml_cos_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
__device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_fma_f16(_Float16, _Float16,
                                                          _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
__device__ _Float16 __ocml_sin_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int);

typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
typedef short __2i16 __attribute__((ext_vector_type(2)));

// We need to match C99's bool and get an i1 in the IR.
#ifdef __cplusplus
typedef bool __ockl_bool;
#else
typedef _Bool __ockl_bool;
#endif

__device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b,
                                                     float c, __ockl_bool s);
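// Editor's sketch (assumption, not part of the original header): __ockl_fdot2
// appears to evaluate a packed-half dot product plus accumulator,
// dot(a, b) + c, with the trailing __ockl_bool requesting result saturation.
// A hypothetical wrapper:
static __device__ inline float __fdot2_demo(__2f16 a, __2f16 b, float c) {
  return __ockl_fdot2(a, b, c, (__ockl_bool)0); // 0: no saturation
}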
__device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
__device__ __2f16 __ocml_cos_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_fma_2f16(__2f16, __2f16,
                                                         __2f16);
__device__ __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16);
__device__ __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);

#if HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR >= 560
#define __DEPRECATED_SINCE_HIP_560(X) __attribute__((deprecated(X)))
#else
#define __DEPRECATED_SINCE_HIP_560(X)
#endif

// Deprecated; should be removed once the ROCm releases using it are no longer
// relevant.
__DEPRECATED_SINCE_HIP_560("use ((_Float16)1.0) / ")
__device__ inline _Float16 __llvm_amdgcn_rcp_f16(_Float16 x) {
  return ((_Float16)1.0f) / x;
}

__DEPRECATED_SINCE_HIP_560("use ((__2f16)1.0) / ")
__device__ inline __2f16 __llvm_amdgcn_rcp_2f16(__2f16 __x) {
  return ((__2f16)1.0f) / __x;
}

#undef __DEPRECATED_SINCE_HIP_560
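// Editor's note (not part of the original header): per the deprecation
// messages above, callers should use a plain reciprocal expression instead:
//
//   _Float16 __r  = ((_Float16)1.0f) / __x; // replaces __llvm_amdgcn_rcp_f16(__x)
//   __2f16   __r2 = ((__2f16)1.0f) / __v;   // replaces __llvm_amdgcn_rcp_2f16(__v)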

__device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
__device__ __2f16 __ocml_sin_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_pown_2f16(__2f16, __2i16);

#ifdef __cplusplus
} // extern "C"
#endif

#endif // __CLANG_HIP_LIBDEVICE_DECLARES_H__
File diff suppressed because it is too large
@@ -1,162 +0,0 @@
/*===---- __clang_hip_runtime_wrapper.h - HIP runtime support ---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/*
 * WARNING: This header is intended to be directly -include'd by
 * the compiler and is not supposed to be included by users.
 *
 */

#ifndef __CLANG_HIP_RUNTIME_WRAPPER_H__
#define __CLANG_HIP_RUNTIME_WRAPPER_H__

#if __HIP__

#define __host__ __attribute__((host))
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))
#define __constant__ __attribute__((constant))
#define __managed__ __attribute__((managed))

#if !defined(__cplusplus) || __cplusplus < 201103L
#define nullptr NULL
#endif

#ifdef __cplusplus
extern "C" {
__attribute__((__visibility__("default")))
__attribute__((weak))
__attribute__((noreturn))
__device__ void __cxa_pure_virtual(void) {
  __builtin_trap();
}
__attribute__((__visibility__("default")))
__attribute__((weak))
__attribute__((noreturn))
__device__ void __cxa_deleted_virtual(void) {
  __builtin_trap();
}
}
#endif //__cplusplus

#if !defined(__HIPCC_RTC__)
#if __has_include("hip/hip_version.h")
#include "hip/hip_version.h"
#endif // __has_include("hip/hip_version.h")
#endif // __HIPCC_RTC__

typedef __SIZE_TYPE__ __hip_size_t;

#ifdef __cplusplus
extern "C" {
#endif //__cplusplus

#if HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR >= 405
__device__ unsigned long long __ockl_dm_alloc(unsigned long long __size);
__device__ void __ockl_dm_dealloc(unsigned long long __addr);
#if __has_feature(address_sanitizer)
__device__ unsigned long long __asan_malloc_impl(unsigned long long __size,
                                                 unsigned long long __pc);
__device__ void __asan_free_impl(unsigned long long __addr,
                                 unsigned long long __pc);
__attribute__((noinline, weak)) __device__ void *malloc(__hip_size_t __size) {
  unsigned long long __pc = (unsigned long long)__builtin_return_address(0);
  return (void *)__asan_malloc_impl(__size, __pc);
}
__attribute__((noinline, weak)) __device__ void free(void *__ptr) {
  unsigned long long __pc = (unsigned long long)__builtin_return_address(0);
  __asan_free_impl((unsigned long long)__ptr, __pc);
}
#else // __has_feature(address_sanitizer)
__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
  return (void *)__ockl_dm_alloc(__size);
}
__attribute__((weak)) inline __device__ void free(void *__ptr) {
  __ockl_dm_dealloc((unsigned long long)__ptr);
}
#endif // __has_feature(address_sanitizer)
#else // HIP version check
#if __HIP_ENABLE_DEVICE_MALLOC__
__device__ void *__hip_malloc(__hip_size_t __size);
__device__ void *__hip_free(void *__ptr);
__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
  return __hip_malloc(__size);
}
__attribute__((weak)) inline __device__ void free(void *__ptr) {
  __hip_free(__ptr);
}
#else // __HIP_ENABLE_DEVICE_MALLOC__
__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
  __builtin_trap();
  return (void *)0;
}
__attribute__((weak)) inline __device__ void free(void *__ptr) {
  __builtin_trap();
}
#endif // __HIP_ENABLE_DEVICE_MALLOC__
#endif // HIP version check
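// Editor's sketch (not part of the original header): whichever branch above is
// selected, device code allocates through the usual C interface. A
// hypothetical kernel:
//
//   __global__ void __alloc_demo(int **__out) {
//     int *__p = (int *)malloc(sizeof(int)); // dispatches to the impl above
//     *__p = 42;
//     *__out = __p; // released later with free(__p) from device code
//   }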

#ifdef __cplusplus
} // extern "C"
#endif //__cplusplus

#if !defined(__HIPCC_RTC__)
#include <cmath>
#include <cstdlib>
#include <stdlib.h>
#if __has_include("hip/hip_version.h")
#include "hip/hip_version.h"
#endif // __has_include("hip/hip_version.h")
#else
typedef __SIZE_TYPE__ size_t;
// Define macros needed to declare the HIP device APIs without the standard
// C/C++ headers. This is for readability, so that these APIs can be written
// the same way as in the non-hipRTC case. The macros are popped afterwards so
// that they do not pollute the user's namespace.
#pragma push_macro("NULL")
#pragma push_macro("uint32_t")
#pragma push_macro("uint64_t")
#pragma push_macro("CHAR_BIT")
#pragma push_macro("INT_MAX")
#pragma push_macro("INT_MIN")
#define NULL (void *)0
#define uint32_t __UINT32_TYPE__
#define uint64_t __UINT64_TYPE__
#define CHAR_BIT __CHAR_BIT__
#define INT_MAX __INT_MAX__
#define INT_MIN (-__INT_MAX__ - 1)
#endif // __HIPCC_RTC__

#include <__clang_hip_libdevice_declares.h>
#include <__clang_hip_math.h>
#include <__clang_hip_stdlib.h>

#if defined(__HIPCC_RTC__)
#include <__clang_hip_cmath.h>
#else
#include <__clang_cuda_math_forward_declares.h>
#include <__clang_hip_cmath.h>
#include <__clang_cuda_complex_builtins.h>
#include <algorithm>
#include <complex>
#include <new>
#endif // __HIPCC_RTC__

#define __CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ 1
#if defined(__HIPCC_RTC__)
#pragma pop_macro("NULL")
#pragma pop_macro("uint32_t")
#pragma pop_macro("uint64_t")
#pragma pop_macro("CHAR_BIT")
#pragma pop_macro("INT_MAX")
#pragma pop_macro("INT_MIN")
#endif // __HIPCC_RTC__
#endif // __HIP__
#endif // __CLANG_HIP_RUNTIME_WRAPPER_H__
@@ -1,43 +0,0 @@
/*===---- __clang_hip_stdlib.h - Device-side HIP math support --------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __CLANG_HIP_STDLIB_H__
#define __CLANG_HIP_STDLIB_H__

#if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__)
#error "This file is for HIP and OpenMP AMDGCN device compilation only."
#endif

#if !defined(__cplusplus)

#include <limits.h>

#ifdef __OPENMP_AMDGCN__
#define __DEVICE__ static inline __attribute__((always_inline, nothrow))
#else
#define __DEVICE__ static __device__ inline __attribute__((always_inline))
#endif

__DEVICE__
int abs(int __x) {
  int __sgn = __x >> (sizeof(int) * CHAR_BIT - 1);
  return (__x ^ __sgn) - __sgn;
}
__DEVICE__
long labs(long __x) {
  long __sgn = __x >> (sizeof(long) * CHAR_BIT - 1);
  return (__x ^ __sgn) - __sgn;
}
__DEVICE__
long long llabs(long long __x) {
  long long __sgn = __x >> (sizeof(long long) * CHAR_BIT - 1);
  return (__x ^ __sgn) - __sgn;
}
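/* Editor's note (not part of the original header): this is the classic
 * branchless absolute value. The arithmetic right shift replicates the sign
 * bit, so for negative __x, __sgn is all ones (-1) and
 *   (__x ^ -1) - (-1) == ~__x + 1 == -__x,
 * while for non-negative __x, __sgn is 0 and the value passes through.
 * Worked example, 32-bit __x = -5: __sgn = 0xFFFFFFFF, __x ^ __sgn = 4,
 * 4 - (-1) = 5. As with any int-returning abs, INT_MIN still overflows. */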

#endif // !defined(__cplusplus)

#endif // __CLANG_HIP_STDLIB_H__
@@ -1,217 +0,0 @@
/*===---- spirv_builtin_vars.h - SPIR-V built-in ---------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __SPIRV_BUILTIN_VARS_H
#define __SPIRV_BUILTIN_VARS_H

#if __cplusplus >= 201103L
#define __SPIRV_NOEXCEPT noexcept
#else
#define __SPIRV_NOEXCEPT
#endif

#pragma push_macro("__size_t")
#pragma push_macro("__uint32_t")
#pragma push_macro("__uint64_t")
#define __size_t __SIZE_TYPE__
#define __uint32_t __UINT32_TYPE__

#define __SPIRV_overloadable __attribute__((overloadable))
#define __SPIRV_convergent __attribute__((convergent))
#define __SPIRV_inline __attribute__((always_inline))

#define __global __attribute__((opencl_global))
#define __local __attribute__((opencl_local))
#define __private __attribute__((opencl_private))
#define __constant __attribute__((opencl_constant))
#ifdef __SYCL_DEVICE_ONLY__
#define __generic
#else
#define __generic __attribute__((opencl_generic))
#endif

// Check if SPIR-V builtins are supported.
// As the translator doesn't use the LLVM intrinsics (which would be emitted if
// we used the SPIR-V builtins), we can't rely on the SPIRV32/SPIRV64 etc.
// macros to establish whether the builtin alias is usable. We disable the
// builtins altogether if we do not intend to use the backend. So instead of
// using target macros, rely on a __has_builtin test.
#if (__has_builtin(__builtin_spirv_num_workgroups))
#define __SPIRV_BUILTIN_ALIAS(builtin)                                         \
  __attribute__((clang_builtin_alias(builtin)))
#else
#define __SPIRV_BUILTIN_ALIAS(builtin)
#endif

// Builtin IDs and sizes

extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_num_workgroups) __size_t
    __spirv_NumWorkgroups(int);
extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_workgroup_size) __size_t
    __spirv_WorkgroupSize(int);
extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_workgroup_id) __size_t
    __spirv_WorkgroupId(int);
extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_local_invocation_id) __size_t
    __spirv_LocalInvocationId(int);
extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_global_invocation_id) __size_t
    __spirv_GlobalInvocationId(int);

extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_global_size) __size_t
    __spirv_GlobalSize(int);
extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_global_offset) __size_t
    __spirv_GlobalOffset(int);
extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_subgroup_size) __uint32_t
    __spirv_SubgroupSize();
extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_subgroup_max_size) __uint32_t
    __spirv_SubgroupMaxSize();
extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_num_subgroups) __uint32_t
    __spirv_NumSubgroups();
extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_subgroup_id) __uint32_t
    __spirv_SubgroupId();
extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_subgroup_local_invocation_id)
    __uint32_t __spirv_SubgroupLocalInvocationId();
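// Editor's sketch (assumption, not part of the original header): each builtin
// takes the dimension index as its argument, so a global linear id along x can
// be composed in the style of OpenCL's get_global_id(). The helper is
// hypothetical:
//
//   static inline __size_t __spirv_demo_global_id_x(void) {
//     return __spirv_GlobalInvocationId(0); // dimension 0 == x
//   }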

// OpGenericCastToPtrExplicit

extern __SPIRV_overloadable
__SPIRV_BUILTIN_ALIAS(__builtin_spirv_generic_cast_to_ptr_explicit)
__global void *__spirv_GenericCastToPtrExplicit_ToGlobal(__generic void *,
                                                         int) __SPIRV_NOEXCEPT;
extern __SPIRV_overloadable
__SPIRV_BUILTIN_ALIAS(__builtin_spirv_generic_cast_to_ptr_explicit)
__global const void *
__spirv_GenericCastToPtrExplicit_ToGlobal(__generic const void *,
                                          int) __SPIRV_NOEXCEPT;
extern __SPIRV_overloadable
__SPIRV_BUILTIN_ALIAS(__builtin_spirv_generic_cast_to_ptr_explicit)
__global volatile void *
__spirv_GenericCastToPtrExplicit_ToGlobal(__generic volatile void *,
                                          int) __SPIRV_NOEXCEPT;
extern __SPIRV_overloadable
__SPIRV_BUILTIN_ALIAS(__builtin_spirv_generic_cast_to_ptr_explicit)
__global const volatile void *
__spirv_GenericCastToPtrExplicit_ToGlobal(__generic const volatile void *,
                                          int) __SPIRV_NOEXCEPT;
extern __SPIRV_overloadable
__SPIRV_BUILTIN_ALIAS(__builtin_spirv_generic_cast_to_ptr_explicit)
__local void *__spirv_GenericCastToPtrExplicit_ToLocal(__generic void *,
                                                       int) __SPIRV_NOEXCEPT;
extern __SPIRV_overloadable
__SPIRV_BUILTIN_ALIAS(__builtin_spirv_generic_cast_to_ptr_explicit)
__local const void *
__spirv_GenericCastToPtrExplicit_ToLocal(__generic const void *,
                                         int) __SPIRV_NOEXCEPT;
extern __SPIRV_overloadable
__SPIRV_BUILTIN_ALIAS(__builtin_spirv_generic_cast_to_ptr_explicit)
__local volatile void *
__spirv_GenericCastToPtrExplicit_ToLocal(__generic volatile void *,
                                         int) __SPIRV_NOEXCEPT;
extern __SPIRV_overloadable
__SPIRV_BUILTIN_ALIAS(__builtin_spirv_generic_cast_to_ptr_explicit)
__local const volatile void *
__spirv_GenericCastToPtrExplicit_ToLocal(__generic const volatile void *,
                                         int) __SPIRV_NOEXCEPT;
extern __SPIRV_overloadable
__SPIRV_BUILTIN_ALIAS(__builtin_spirv_generic_cast_to_ptr_explicit)
__private void *
__spirv_GenericCastToPtrExplicit_ToPrivate(__generic void *,
                                           int) __SPIRV_NOEXCEPT;
extern __SPIRV_overloadable
__SPIRV_BUILTIN_ALIAS(__builtin_spirv_generic_cast_to_ptr_explicit)
__private const void *
__spirv_GenericCastToPtrExplicit_ToPrivate(__generic const void *,
                                           int) __SPIRV_NOEXCEPT;
extern __SPIRV_overloadable
__SPIRV_BUILTIN_ALIAS(__builtin_spirv_generic_cast_to_ptr_explicit)
__private volatile void *
__spirv_GenericCastToPtrExplicit_ToPrivate(__generic volatile void *,
                                           int) __SPIRV_NOEXCEPT;
extern __SPIRV_overloadable
__SPIRV_BUILTIN_ALIAS(__builtin_spirv_generic_cast_to_ptr_explicit)
__private const volatile void *
__spirv_GenericCastToPtrExplicit_ToPrivate(__generic const volatile void *,
                                           int) __SPIRV_NOEXCEPT;

// OpGenericCastToPtr

static __SPIRV_overloadable __SPIRV_inline __global void *
__spirv_GenericCastToPtr_ToGlobal(__generic void *p, int) __SPIRV_NOEXCEPT {
  return (__global void *)p;
}
static __SPIRV_overloadable __SPIRV_inline __global const void *
__spirv_GenericCastToPtr_ToGlobal(__generic const void *p,
                                  int) __SPIRV_NOEXCEPT {
  return (__global const void *)p;
}
static __SPIRV_overloadable __SPIRV_inline __global volatile void *
__spirv_GenericCastToPtr_ToGlobal(__generic volatile void *p,
                                  int) __SPIRV_NOEXCEPT {
  return (__global volatile void *)p;
}
static __SPIRV_overloadable __SPIRV_inline __global const volatile void *
__spirv_GenericCastToPtr_ToGlobal(__generic const volatile void *p,
                                  int) __SPIRV_NOEXCEPT {
  return (__global const volatile void *)p;
}
static __SPIRV_overloadable __SPIRV_inline __local void *
__spirv_GenericCastToPtr_ToLocal(__generic void *p, int) __SPIRV_NOEXCEPT {
  return (__local void *)p;
}
static __SPIRV_overloadable __SPIRV_inline __local const void *
__spirv_GenericCastToPtr_ToLocal(__generic const void *p,
                                 int) __SPIRV_NOEXCEPT {
  return (__local const void *)p;
}
static __SPIRV_overloadable __SPIRV_inline __local volatile void *
__spirv_GenericCastToPtr_ToLocal(__generic volatile void *p,
                                 int) __SPIRV_NOEXCEPT {
  return (__local volatile void *)p;
}
static __SPIRV_overloadable __SPIRV_inline __local const volatile void *
__spirv_GenericCastToPtr_ToLocal(__generic const volatile void *p,
                                 int) __SPIRV_NOEXCEPT {
  return (__local const volatile void *)p;
}
static __SPIRV_overloadable __SPIRV_inline __private void *
__spirv_GenericCastToPtr_ToPrivate(__generic void *p, int) __SPIRV_NOEXCEPT {
  return (__private void *)p;
}
static __SPIRV_overloadable __SPIRV_inline __private const void *
__spirv_GenericCastToPtr_ToPrivate(__generic const void *p,
                                   int) __SPIRV_NOEXCEPT {
  return (__private const void *)p;
}
static __SPIRV_overloadable __SPIRV_inline __private volatile void *
__spirv_GenericCastToPtr_ToPrivate(__generic volatile void *p,
                                   int) __SPIRV_NOEXCEPT {
  return (__private volatile void *)p;
}
static __SPIRV_overloadable __SPIRV_inline __private const volatile void *
__spirv_GenericCastToPtr_ToPrivate(__generic const volatile void *p,
                                   int) __SPIRV_NOEXCEPT {
  return (__private const volatile void *)p;
}
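// Editor's sketch (assumption, not part of the original header): these are
// plain address-space casts, and the int operand mirrors the SPIR-V storage
// class operand (e.g. Workgroup == 4 in the SPIR-V spec). Hypothetical usage:
//
//   __local int *__lp = (__local int *)
//       __spirv_GenericCastToPtr_ToLocal(__gp, 4); // __gp: __generic int *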

#pragma pop_macro("__size_t")
#pragma pop_macro("__uint32_t")
#pragma pop_macro("__uint64_t")

#undef __SPIRV_overloadable
#undef __SPIRV_convergent
#undef __SPIRV_inline

#undef __global
#undef __local
#undef __private
#undef __constant
#undef __generic

#undef __SPIRV_BUILTIN_ALIAS
#undef __SPIRV_NOEXCEPT

#endif /* __SPIRV_BUILTIN_VARS_H */
@@ -1,13 +0,0 @@
/*===---- __stdarg___gnuc_va_list.h - Definition of __gnuc_va_list ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __GNUC_VA_LIST
#define __GNUC_VA_LIST
typedef __builtin_va_list __gnuc_va_list;
#endif
@@ -1,12 +0,0 @@
/*===---- __stdarg___va_copy.h - Definition of __va_copy -------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __va_copy
#define __va_copy(d, s) __builtin_va_copy(d, s)
#endif
@@ -1,12 +0,0 @@
/*===---- __stdarg_header_macro.h ------------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __STDARG_H
#define __STDARG_H
#endif
@@ -1,22 +0,0 @@
/*===---- __stdarg_va_arg.h - Definitions of va_start, va_arg, va_end ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef va_arg

#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
/* C23 uses a special builtin. */
#define va_start(...) __builtin_c23_va_start(__VA_ARGS__)
#else
/* Versions before C23 do require the second parameter. */
#define va_start(ap, param) __builtin_va_start(ap, param)
#endif
#define va_end(ap) __builtin_va_end(ap)
#define va_arg(ap, type) __builtin_va_arg(ap, type)

#endif
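/* Editor's sketch (not part of the original header): typical use of the
 * macros defined above; under C23, va_start may be called with only the
 * va_list argument.
 *
 *   int sum_ints(int count, ...) {
 *     va_list ap;
 *     va_start(ap, count); // C23 also accepts va_start(ap)
 *     int total = 0;
 *     for (int i = 0; i < count; ++i)
 *       total += va_arg(ap, int);
 *     va_end(ap);
 *     return total;
 *   }
 */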
@@ -1,12 +0,0 @@
/*===---- __stdarg_va_copy.h - Definition of va_copy -----------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef va_copy
#define va_copy(dest, src) __builtin_va_copy(dest, src)
#endif
@@ -1,13 +0,0 @@
/*===---- __stdarg_va_list.h - Definition of va_list -----------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef _VA_LIST
#define _VA_LIST
typedef __builtin_va_list va_list;
#endif
@@ -1,12 +0,0 @@
/*===---- __stddef_header_macro.h ------------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __STDDEF_H
#define __STDDEF_H
#endif
@@ -1,27 +0,0 @@
/*===---- __stddef_max_align_t.h - Definition of max_align_t ---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __CLANG_MAX_ALIGN_T_DEFINED
#define __CLANG_MAX_ALIGN_T_DEFINED

#if defined(_MSC_VER)
typedef double max_align_t;
#elif defined(__APPLE__)
typedef long double max_align_t;
#else
// Define 'max_align_t' to match the GCC definition.
typedef struct {
  long long __clang_max_align_nonce1
      __attribute__((__aligned__(__alignof__(long long))));
  long double __clang_max_align_nonce2
      __attribute__((__aligned__(__alignof__(long double))));
} max_align_t;
#endif

#endif
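/* Editor's note (not part of the original header): the two-member struct
 * gives max_align_t the strictest of the two alignments, which the following
 * hypothetical checks would confirm:
 *
 *   _Static_assert(_Alignof(max_align_t) >= _Alignof(long long), "");
 *   _Static_assert(_Alignof(max_align_t) >= _Alignof(long double), "");
 */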
@@ -1,29 +0,0 @@
/*===---- __stddef_null.h - Definition of NULL -----------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#if !defined(NULL) || !__building_module(_Builtin_stddef)

/* linux/stddef.h will define NULL to 0. glibc (and other) headers then define
 * __need_NULL and rely on stddef.h to redefine NULL to the correct value again.
 * Modules don't support redefining macros like that, but support that pattern
 * in the non-modules case.
 */
#undef NULL

#ifdef __cplusplus
#if !defined(__MINGW32__) && !defined(_MSC_VER)
#define NULL __null
#else
#define NULL 0
#endif
#else
#define NULL ((void*)0)
#endif

#endif
@@ -1,29 +0,0 @@
/*===---- __stddef_nullptr_t.h - Definition of nullptr_t -------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/*
 * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
 * and needs to behave as if it was textual.
 */
#if !defined(_NULLPTR_T) ||                                                    \
    (__has_feature(modules) && !__building_module(_Builtin_stddef))
#define _NULLPTR_T

#ifdef __cplusplus
#if defined(_MSC_EXTENSIONS) && defined(_NATIVE_NULLPTR_SUPPORTED)
namespace std {
typedef decltype(nullptr) nullptr_t;
}
using ::std::nullptr_t;
#endif
#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
typedef typeof(nullptr) nullptr_t;
#endif

#endif
@@ -1,17 +0,0 @@
/*===---- __stddef_offsetof.h - Definition of offsetof ---------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/*
 * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
 * and needs to behave as if it was textual.
 */
#if !defined(offsetof) ||                                                      \
    (__has_feature(modules) && !__building_module(_Builtin_stddef))
#define offsetof(t, d) __builtin_offsetof(t, d)
#endif
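/* Editor's sketch (not part of the original header): __builtin_offsetof
 * yields a constant byte offset, e.g.:
 *
 *   struct __demo { char c; double d; };
 *   enum { __d_off = offsetof(struct __demo, d) }; // typically 8
 */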
@@ -1,20 +0,0 @@
|
|||||||
/*===---- __stddef_ptrdiff_t.h - Definition of ptrdiff_t -------------------===
|
|
||||||
*
|
|
||||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
||||||
* See https://llvm.org/LICENSE.txt for license information.
|
|
||||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
||||||
*
|
|
||||||
*===-----------------------------------------------------------------------===
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
|
||||||
* When -fbuiltin-headers-in-system-modules is set this is a non-modular header
|
|
||||||
* and needs to behave as if it was textual.
|
|
||||||
*/
|
|
||||||
#if !defined(_PTRDIFF_T) || \
|
|
||||||
(__has_feature(modules) && !__building_module(_Builtin_stddef))
|
|
||||||
#define _PTRDIFF_T
|
|
||||||
|
|
||||||
typedef __PTRDIFF_TYPE__ ptrdiff_t;
|
|
||||||
|
|
||||||
#endif
|
|
||||||
@@ -1,20 +0,0 @@
|
|||||||
/*===---- __stddef_rsize_t.h - Definition of rsize_t -----------------------===
|
|
||||||
*
|
|
||||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
||||||
* See https://llvm.org/LICENSE.txt for license information.
|
|
||||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
||||||
*
|
|
||||||
*===-----------------------------------------------------------------------===
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
|
||||||
* When -fbuiltin-headers-in-system-modules is set this is a non-modular header
|
|
||||||
* and needs to behave as if it was textual.
|
|
||||||
*/
|
|
||||||
#if !defined(_RSIZE_T) || \
|
|
||||||
(__has_feature(modules) && !__building_module(_Builtin_stddef))
|
|
||||||
#define _RSIZE_T
|
|
||||||
|
|
||||||
typedef __SIZE_TYPE__ rsize_t;
|
|
||||||
|
|
||||||
#endif
|
|
||||||
@@ -1,20 +0,0 @@
/*===---- __stddef_size_t.h - Definition of size_t -------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/*
 * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
 * and needs to behave as if it was textual.
 */
#if !defined(_SIZE_T) || \
    (__has_feature(modules) && !__building_module(_Builtin_stddef))
#define _SIZE_T

typedef __SIZE_TYPE__ size_t;

#endif
@@ -1,21 +0,0 @@
/*===---- __stddef_unreachable.h - Definition of unreachable ---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __cplusplus

/*
 * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
 * and needs to behave as if it was textual.
 */
#if !defined(unreachable) || \
    (__has_feature(modules) && !__building_module(_Builtin_stddef))
#define unreachable() __builtin_unreachable()
#endif

#endif
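unreachable() tells the optimizer that control cannot reach that point; actually reaching it is undefined behavior. A hedged sketch (C23; the enum and function are illustrative):

#include <stddef.h>

enum dir { NORTH, SOUTH };

static int step(enum dir d) {
  switch (d) {
  case NORTH: return 1;
  case SOUTH: return -1;
  }
  unreachable(); /* every enumerator is handled above */
}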
@@ -1,28 +0,0 @@
/*===---- __stddef_wchar.h - Definition of wchar_t -------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#if !defined(__cplusplus) || (defined(_MSC_VER) && !_NATIVE_WCHAR_T_DEFINED)

/*
 * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
 * and needs to behave as if it was textual.
 */
#if !defined(_WCHAR_T) || \
    (__has_feature(modules) && !__building_module(_Builtin_stddef))
#define _WCHAR_T

#ifdef _MSC_EXTENSIONS
#define _WCHAR_T_DEFINED
#endif

typedef __WCHAR_TYPE__ wchar_t;

#endif

#endif
@@ -1,15 +0,0 @@
/*===---- __stddef_wint.h - Definition of wint_t ---------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef _WINT_T
#define _WINT_T

typedef __WINT_TYPE__ wint_t;

#endif
@@ -1,140 +0,0 @@
/*===---- __wmmintrin_aes.h - AES intrinsics -------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __WMMINTRIN_H
#error "Never use <__wmmintrin_aes.h> directly; include <wmmintrin.h> instead."
#endif

#ifndef __WMMINTRIN_AES_H
#define __WMMINTRIN_AES_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes"), __min_vector_width__(128)))

/// Performs a single round of AES encryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESENC </c> instruction.
///
/// \param __V
///    A 128-bit integer vector containing the state value.
/// \param __R
///    A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the encrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesenc_si128(__m128i __V, __m128i __R)
{
  return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
}

/// Performs the final round of AES encryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESENCLAST </c> instruction.
///
/// \param __V
///    A 128-bit integer vector containing the state value.
/// \param __R
///    A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the encrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesenclast_si128(__m128i __V, __m128i __R)
{
  return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R);
}

/// Performs a single round of AES decryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESDEC </c> instruction.
///
/// \param __V
///    A 128-bit integer vector containing the state value.
/// \param __R
///    A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the decrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesdec_si128(__m128i __V, __m128i __R)
{
  return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R);
}

/// Performs the final round of AES decryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESDECLAST </c> instruction.
///
/// \param __V
///    A 128-bit integer vector containing the state value.
/// \param __R
///    A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the decrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesdeclast_si128(__m128i __V, __m128i __R)
{
  return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R);
}

/// Applies the AES InvMixColumns() transformation to an expanded key
/// contained in the source operand, and writes the result to the
/// destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESIMC </c> instruction.
///
/// \param __V
///    A 128-bit integer vector containing the expanded key.
/// \returns A 128-bit integer vector containing the transformed value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesimc_si128(__m128i __V)
{
  return (__m128i)__builtin_ia32_aesimc128((__v2di)__V);
}

/// Generates a round key for AES encryption, operating on 128-bit data
/// specified in the first source operand and using an 8-bit round constant
/// specified by the second source operand, and writes the result to the
/// destination.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
/// \endcode
///
/// This intrinsic corresponds to the <c> AESKEYGENASSIST </c> instruction.
///
/// \param C
///    A 128-bit integer vector that is used to generate the AES encryption key.
/// \param R
///    An 8-bit round constant used to generate the AES encryption key.
/// \returns A 128-bit round key for AES encryption.
#define _mm_aeskeygenassist_si128(C, R) \
  ((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R)))

#undef __DEFAULT_FN_ATTRS

#endif /* __WMMINTRIN_AES_H */
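To show how the round intrinsics above compose, here is a minimal AES-128 block-encryption sketch (not from the diff; it assumes the 11 round keys in rk[] were expanded elsewhere, e.g. with _mm_aeskeygenassist_si128, and a -maes build):

#include <wmmintrin.h>

/* Encrypt one 16-byte block with pre-expanded AES-128 round keys. */
static __m128i aes128_encrypt_block(__m128i block, const __m128i rk[11]) {
  block = _mm_xor_si128(block, rk[0]);          /* initial AddRoundKey */
  for (int i = 1; i < 10; ++i)
    block = _mm_aesenc_si128(block, rk[i]);     /* rounds 1..9 */
  return _mm_aesenclast_si128(block, rk[10]);   /* final round */
}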
@@ -1,48 +0,0 @@
/*===---- __wmmintrin_pclmul.h - PCLMUL intrinsics --------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __WMMINTRIN_H
#error "Never use <__wmmintrin_pclmul.h> directly; include <wmmintrin.h> instead."
#endif

#ifndef __WMMINTRIN_PCLMUL_H
#define __WMMINTRIN_PCLMUL_H

/// Multiplies two 64-bit integer values, which are selected from source
/// operands using the immediate-value operand. The multiplication is a
/// carry-less multiplication, and the 128-bit integer product is stored in
/// the destination.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_clmulepi64_si128(__m128i X, __m128i Y, const int I);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x i64] containing one of the source operands.
/// \param Y
///    A 128-bit vector of [2 x i64] containing one of the source operands.
/// \param I
///    An immediate value specifying which 64-bit values to select from the
///    operands. Bit 0 is used to select a value from operand \a X, and bit
///    4 is used to select a value from operand \a Y: \n
///    Bit[0]=0 indicates that bits[63:0] of operand \a X are used. \n
///    Bit[0]=1 indicates that bits[127:64] of operand \a X are used. \n
///    Bit[4]=0 indicates that bits[63:0] of operand \a Y are used. \n
///    Bit[4]=1 indicates that bits[127:64] of operand \a Y are used.
/// \returns The 128-bit integer vector containing the result of the carry-less
///    multiplication of the selected 64-bit values.
#define _mm_clmulepi64_si128(X, Y, I) \
  ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), \
                                        (__v2di)(__m128i)(Y), (char)(I)))

#endif /* __WMMINTRIN_PCLMUL_H */
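A short sketch of the immediate encoding (illustrative, not from the diff): bit 0 of the immediate selects the low or high half of X, bit 4 selects the half of Y, so 0x00, 0x01, 0x10 and 0x11 cover the four 64x64 carry-less products:

#include <wmmintrin.h>

/* Low and high 64x64 carry-less partial products of a and b. */
static void clmul_parts(__m128i a, __m128i b, __m128i *lo, __m128i *hi) {
  *lo = _mm_clmulepi64_si128(a, b, 0x00); /* a[63:0]   * b[63:0]   */
  *hi = _mm_clmulepi64_si128(a, b, 0x11); /* a[127:64] * b[127:64] */
}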
@@ -1,165 +0,0 @@
/*===---- adcintrin.h - ADC intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __ADCINTRIN_H
#define __ADCINTRIN_H

#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

/* Define the default attributes for the functions in this file. */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__)) constexpr
#else
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#endif

/* Use C++ inline semantics in C++, GNU inline for C mode. */
#if defined(__cplusplus)
#define __INLINE __inline
#else
#define __INLINE static __inline
#endif

#if defined(__cplusplus)
extern "C" {
#endif

/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
/// by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
/// at \a __p, and returns the 8-bit carry-out (carry flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store32(__p, __x + __y + temp)
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c ADC instruction.
///
/// \param __cf
///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
///    A 32-bit unsigned addend.
/// \param __y
///    A 32-bit unsigned addend.
/// \param __p
///    Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarry_u32(unsigned char __cf,
                                                        unsigned int __x,
                                                        unsigned int __y,
                                                        unsigned int *__p) {
  return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
}

/// Adds unsigned 32-bit integer \a __y to 0 or 1 as indicated by the carry
/// flag \a __cf, and subtracts the result from unsigned 32-bit integer
/// \a __x. Stores the unsigned 32-bit difference in the memory at \a __p,
/// and returns the 8-bit carry-out (carry or overflow flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store32(__p, __x - (__y + temp))
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c SBB instruction.
///
/// \param __cf
///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
///    The 32-bit unsigned minuend.
/// \param __y
///    The 32-bit unsigned subtrahend.
/// \param __p
///    Pointer to memory for storing the difference.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS _subborrow_u32(unsigned char __cf,
                                                         unsigned int __x,
                                                         unsigned int __y,
                                                         unsigned int *__p) {
  return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
}

#ifdef __x86_64__
/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
/// by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
/// at \a __p, and returns the 8-bit carry-out (carry flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store64(__p, __x + __y + temp)
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c ADC instruction.
///
/// \param __cf
///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
///    A 64-bit unsigned addend.
/// \param __y
///    A 64-bit unsigned addend.
/// \param __p
///    Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS
_addcarry_u64(unsigned char __cf, unsigned long long __x,
              unsigned long long __y, unsigned long long *__p) {
  return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
}

/// Adds unsigned 64-bit integer \a __y to 0 or 1 as indicated by the carry
/// flag \a __cf, and subtracts the result from unsigned 64-bit integer
/// \a __x. Stores the unsigned 64-bit difference in the memory at \a __p,
/// and returns the 8-bit carry-out (carry or overflow flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store64(__p, __x - (__y + temp))
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c SBB instruction.
///
/// \param __cf
///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
///    The 64-bit unsigned minuend.
/// \param __y
///    The 64-bit unsigned subtrahend.
/// \param __p
///    Pointer to memory for storing the difference.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS
_subborrow_u64(unsigned char __cf, unsigned long long __x,
               unsigned long long __y, unsigned long long *__p) {
  return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
}
#endif

#if defined(__cplusplus)
}
#endif

#undef __INLINE
#undef __DEFAULT_FN_ATTRS

#endif /* __ADCINTRIN_H */
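For context on how these carry-chain intrinsics compose, a minimal sketch (not part of the diff; the 4-limb little-endian layout and names are illustrative) that adds two 256-bit integers by threading the carry flag through _addcarry_u64:

#include <immintrin.h>

/* Adds two 256-bit numbers stored little-endian as 4 x 64-bit limbs.
   Returns the final carry-out. */
static unsigned char add256(const unsigned long long a[4],
                            const unsigned long long b[4],
                            unsigned long long out[4]) {
  unsigned char c = 0;
  for (int i = 0; i < 4; ++i)
    c = _addcarry_u64(c, a[i], b[i], &out[i]);
  return c;
}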
@@ -1,107 +0,0 @@
/*===---- adxintrin.h - ADX intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <adxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __ADXINTRIN_H
#define __ADXINTRIN_H

/* Define the default attributes for the functions in this file. */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("adx"))) constexpr
#else
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("adx")))
#endif

/* Use C++ inline semantics in C++, GNU inline for C mode. */
#if defined(__cplusplus)
#define __INLINE __inline
#else
#define __INLINE static __inline
#endif

#if defined(__cplusplus)
extern "C" {
#endif

/* Intrinsics that are available only if __ADX__ is defined. */

/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
/// by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
/// at \a __p, and returns the 8-bit carry-out (carry flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store32(__p, __x + __y + temp)
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c ADCX instruction.
///
/// \param __cf
///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
///    A 32-bit unsigned addend.
/// \param __y
///    A 32-bit unsigned addend.
/// \param __p
///    Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarryx_u32(unsigned char __cf,
                                                         unsigned int __x,
                                                         unsigned int __y,
                                                         unsigned int *__p) {
  return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
}

#ifdef __x86_64__
/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
/// by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
/// at \a __p, and returns the 8-bit carry-out (carry flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store64(__p, __x + __y + temp)
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c ADCX instruction.
///
/// \param __cf
///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
///    A 64-bit unsigned addend.
/// \param __y
///    A 64-bit unsigned addend.
/// \param __p
///    Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS
_addcarryx_u64(unsigned char __cf, unsigned long long __x,
               unsigned long long __y, unsigned long long *__p) {
  return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
}
#endif

#if defined(__cplusplus)
}
#endif

#undef __INLINE
#undef __DEFAULT_FN_ATTRS

#endif /* __ADXINTRIN_H */
File diff suppressed because it is too large
@@ -1,191 +0,0 @@
//===-- amdgpuintrin.h - AMDGPU intrinsic functions -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef __AMDGPUINTRIN_H
#define __AMDGPUINTRIN_H

#ifndef __AMDGPU__
#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
#endif

#ifndef __GPUINTRIN_H
#error "Never use <amdgpuintrin.h> directly; include <gpuintrin.h> instead"
#endif

_Pragma("omp begin declare target device_type(nohost)");
_Pragma("omp begin declare variant match(device = {arch(amdgcn)})");

// Type aliases to the address spaces used by the AMDGPU backend.
#define __gpu_private __attribute__((address_space(5)))
#define __gpu_constant __attribute__((address_space(4)))
#define __gpu_local __attribute__((address_space(3)))
#define __gpu_global __attribute__((address_space(1)))
#define __gpu_generic __attribute__((address_space(0)))

// Attribute to declare a function as a kernel.
#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))

// Returns the number of workgroups in the 'x' dimension of the grid.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
}

// Returns the number of workgroups in the 'y' dimension of the grid.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
}

// Returns the number of workgroups in the 'z' dimension of the grid.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
}

// Returns the 'x' dimension of the current AMD workgroup's id.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
  return __builtin_amdgcn_workgroup_id_x();
}

// Returns the 'y' dimension of the current AMD workgroup's id.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
  return __builtin_amdgcn_workgroup_id_y();
}

// Returns the 'z' dimension of the current AMD workgroup's id.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
  return __builtin_amdgcn_workgroup_id_z();
}

// Returns the number of workitems in the 'x' dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
  return __builtin_amdgcn_workgroup_size_x();
}

// Returns the number of workitems in the 'y' dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
  return __builtin_amdgcn_workgroup_size_y();
}

// Returns the number of workitems in the 'z' dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
  return __builtin_amdgcn_workgroup_size_z();
}

// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
  return __builtin_amdgcn_workitem_id_x();
}

// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
  return __builtin_amdgcn_workitem_id_y();
}

// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
  return __builtin_amdgcn_workitem_id_z();
}

// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
// and compilation options.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
  return __builtin_amdgcn_wavefrontsize();
}

// Returns the id of the thread inside of an AMD wavefront executing together.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
}

// Returns the bit-mask of active threads in the current wavefront.
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
  return __builtin_amdgcn_read_exec();
}

// Copies the value from the first active thread in the wavefront to the rest.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
  return __builtin_amdgcn_readfirstlane(__x);
}

// Returns a bitmask of threads in the current lane for which \p x is true.
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
                                                          bool __x) {
  // ANDing with __lane_mask gives the NVPTX semantics when __lane_mask is a
  // subset of the active threads.
  return __lane_mask & __builtin_amdgcn_ballot_w64(__x);
}

// Waits for all the threads in the block to converge and issues a fence.
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
  __builtin_amdgcn_s_barrier();
  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}

// Waits for all threads in the wavefront to converge; this is a no-op on
// AMDGPU.
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
  __builtin_amdgcn_wave_barrier();
}

// Shuffles the lanes inside the wavefront according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
                      uint32_t __width) {
  uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
  return __builtin_amdgcn_ds_bpermute(__lane << 2, __x);
}

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
  return __gpu_match_any_u32_impl(__lane_mask, __x);
}

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
  return __gpu_match_any_u64_impl(__lane_mask, __x);
}

// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
  return __gpu_match_all_u32_impl(__lane_mask, __x);
}

// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
  return __gpu_match_all_u64_impl(__lane_mask, __x);
}

// Returns true if the flat pointer points to AMDGPU 'shared' memory.
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
  return __builtin_amdgcn_is_shared((void [[clang::address_space(0)]] *)((
      void [[clang::opencl_generic]] *)ptr));
}

// Returns true if the flat pointer points to AMDGPU 'private' memory.
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
  return __builtin_amdgcn_is_private((void [[clang::address_space(0)]] *)((
      void [[clang::opencl_generic]] *)ptr));
}

// Terminates execution of the associated wavefront.
_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
  __builtin_amdgcn_endpgm();
}

// Suspend the thread briefly to assist the scheduler during busy loops.
_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
  __builtin_amdgcn_s_sleep(2);
}

_Pragma("omp end declare variant");
_Pragma("omp end declare target");

#endif // __AMDGPUINTRIN_H
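To see how the grid queries above compose, a hedged sketch of a flat global thread id (the kernel and its arguments are illustrative; assumes the <gpuintrin.h> wrappers shown in this header):

#include <gpuintrin.h>

/* Flat global id in the x dimension: workgroup offset plus workitem id. */
__gpu_kernel void copy_kernel(const int *in, int *out, int n) {
  uint32_t gid = __gpu_block_id_x() * __gpu_num_threads_x() +
                 __gpu_thread_id_x();
  if (gid < (uint32_t)n)
    out[gid] = in[gid];
}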
@@ -1,183 +0,0 @@
/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __AMMINTRIN_H
#define __AMMINTRIN_H

#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

#include <pmmintrin.h>

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a"), __min_vector_width__(128)))

/// Extracts the specified bits from the lower 64 bits of the 128-bit
/// integer vector operand at the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
/// \endcode
///
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param x
///    The value from which bits are extracted.
/// \param len
///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
///    are zero, the length is interpreted as 64.
/// \param idx
///    Bits [5:0] specify the index of the least significant bit; the other
///    bits are ignored. If the sum of the index and length is greater than 64,
///    the result is undefined. If the length and index are both zero, bits
///    [63:0] of parameter \a x are extracted. If the length is zero but the
///    index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
///    extracted from the source operand.
#define _mm_extracti_si64(x, len, idx) \
  ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
                                  (char)(len), (char)(idx)))

/// Extracts the specified bits from the lower 64 bits of the 128-bit
/// integer vector operand at the index and of the length specified by
/// \a __y.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param __x
///    The value from which bits are extracted.
/// \param __y
///    Specifies the index of the least significant bit at [13:8] and the
///    length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
///    length is interpreted as 64. If the sum of the index and length is
///    greater than 64, the result is undefined. If the length and index are
///    both zero, bits [63:0] of parameter \a __x are extracted. If the length
///    is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
///    from the source operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_extract_si64(__m128i __x, __m128i __y)
{
  return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
}

/// Inserts bits of a specified length from the source integer vector
/// \a y into the lower 64 bits of the destination integer vector \a x at
/// the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
///                          const int idx);
/// \endcode
///
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param x
///    The destination operand where bits will be inserted. The inserted bits
///    are defined by the length \a len and by the index \a idx specifying the
///    least significant bit.
/// \param y
///    The source operand containing the bits to be extracted. The extracted
///    bits are the least significant bits of operand \a y of length \a len.
/// \param len
///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
///    are zero, the length is interpreted as 64.
/// \param idx
///    Bits [5:0] specify the index of the least significant bit; the other
///    bits are ignored. If the sum of the index and length is greater than 64,
///    the result is undefined. If the length and index are both zero, bits
///    [63:0] of parameter \a y are inserted into parameter \a x. If the length
///    is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits of
///    destination operand \a x with the specified bitfields replaced by the
///    lower bits of source operand \a y. The upper 64 bits of the return value
///    are undefined.
#define _mm_inserti_si64(x, y, len, idx) \
  ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
                                    (__v2di)(__m128i)(y), \
                                    (char)(len), (char)(idx)))

/// Inserts bits of a specified length from the source integer vector
/// \a __y into the lower 64 bits of the destination integer vector \a __x
/// at the index and of the length specified by \a __y.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param __x
///    The destination operand where bits will be inserted. The inserted bits
///    are defined by the length and by the index of the least significant bit
///    specified by operand \a __y.
/// \param __y
///    The source operand containing the bits to be extracted. The extracted
///    bits are the least significant bits of operand \a __y with length
///    specified by bits [69:64]. These are inserted into the destination at the
///    index specified by bits [77:72]; all other bits are ignored. If bits
///    [69:64] are zero, the length is interpreted as 64. If the sum of the
///    index and length is greater than 64, the result is undefined. If the
///    length and index are both zero, bits [63:0] of parameter \a __y are
///    inserted into parameter \a __x. If the length is zero but the index is
///    non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits of
///    destination operand \a __x with the specified bitfields replaced by the
///    lower bits of source operand \a __y. The upper 64 bits of the return
///    value are undefined.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_insert_si64(__m128i __x, __m128i __y)
{
  return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
}

/// Stores a 64-bit double-precision value in a 64-bit memory location.
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
///
/// \param __p
///    The 64-bit memory location used to store the register value.
/// \param __a
///    The 64-bit double-precision floating-point register value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_sd(void *__p, __m128d __a)
{
  __builtin_ia32_movntsd((double *)__p, (__v2df)__a);
}

/// Stores a 32-bit single-precision floating-point value in a 32-bit
///    memory location. To minimize caching, the data is flagged as
///    non-temporal (unlikely to be used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
///
/// \param __p
///    The 32-bit memory location used to store the register value.
/// \param __a
///    The 32-bit single-precision floating-point register value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_ss(void *__p, __m128 __a)
{
  __builtin_ia32_movntss((float *)__p, (__v4sf)__a);
}

#undef __DEFAULT_FN_ATTRS

#endif /* __AMMINTRIN_H */
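A small sketch of the SSE4a immediate forms (values illustrative, not from the diff): extract an 8-bit field starting at bit 8 of the low qword, then insert it back at bit offset 32:

#include <ammintrin.h>

static __m128i move_byte_field(__m128i v) {
  /* Pull bits [15:8] of the low qword into bits [7:0]. */
  __m128i f = _mm_extracti_si64(v, 8, 8);
  /* Write those 8 bits back into v at bit offset 32. */
  return _mm_inserti_si64(v, f, 8, 32);
}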
@@ -1,382 +0,0 @@
|
|||||||
/*===--------------------- amxavx512intrin.h - AMXAVX512 --------------------===
|
|
||||||
*
|
|
||||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
||||||
* See https://llvm.org/LICENSE.txt for license information.
|
|
||||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
||||||
*
|
|
||||||
*===------------------------------------------------------------------------===
|
|
||||||
*/
|
|
||||||
#ifndef __IMMINTRIN_H
|
|
||||||
#error "Never use <amxavx512intrin.h> directly; include <immintrin.h> instead."
|
|
||||||
#endif // __IMMINTRIN_H
|
|
||||||
|
|
||||||
#ifndef __AMX_AVX512INTRIN_H
|
|
||||||
#define __AMX_AVX512INTRIN_H
|
|
||||||
#if defined(__x86_64__) && defined(__SSE2__)
|
|
||||||
|
|
||||||
#define __DEFAULT_FN_ATTRS_AVX512 \
|
|
||||||
__attribute__((__always_inline__, __nodebug__, \
|
|
||||||
__target__("amx-avx512,avx10.2-512")))
|
|
||||||
|
|
||||||
/// Moves a row from a tile register to a zmm destination register, converting
|
|
||||||
/// the int32 source elements to fp32. The row of the tile is selected by a
|
|
||||||
/// 32b GPR.
|
|
||||||
///
|
|
||||||
/// \headerfile <x86intrin.h>
|
|
||||||
///
|
|
||||||
/// \code
|
|
||||||
/// __m512i _tile_cvtrowd2ps(__tile tsrc, unsigned int row);
|
|
||||||
/// \endcode
|
|
||||||
///
|
|
||||||
/// \code{.operation}
|
|
||||||
/// VL := 512
|
|
||||||
/// VL_bytes := VL >> 3
|
|
||||||
/// row_index := row & 0xffff
|
|
||||||
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
|
|
||||||
/// FOR i := 0 TO (VL_bytes / 4) - 1
|
|
||||||
/// IF i + row_chunk / 4 >= tsrc.colsb / 4
|
|
||||||
/// dst.dword[i] := 0
|
|
||||||
/// ELSE
|
|
||||||
/// dst.f32[i] := CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE)
|
|
||||||
/// FI
|
|
||||||
/// ENDFOR
|
|
||||||
/// dst[MAX_VL-1:VL] := 0
|
|
||||||
/// zero_tileconfig_start()
|
|
||||||
/// \endcode
|
|
||||||
///
|
|
||||||
/// This intrinsic corresponds to the \c TCVTROWD2PS instruction.
|
|
||||||
///
|
|
||||||
/// \param tsrc
|
|
||||||
/// The source tile. Max size is 1024 Bytes.
|
|
||||||
/// \param row
|
|
||||||
/// The row of the source tile
|
|
||||||
#define _tile_cvtrowd2ps(tsrc, row) __builtin_ia32_tcvtrowd2ps(tsrc, row)
|
|
||||||
|
|
||||||
/// Moves a row from a tile register to a zmm destination register, converting
|
|
||||||
/// the fp32 source elements to bf16. It places the resulting bf16 elements
|
|
||||||
/// in the high 16 bits within each dword. The row of the tile is selected
|
|
||||||
/// by a 32b GPR.
|
|
||||||
///
|
|
||||||
/// \headerfile <x86intrin.h>
|
|
||||||
///
|
|
||||||
/// \code
|
|
||||||
/// __m512i _tile_cvtrowps2bf16h(__tile tsrc, unsigned int row);
|
|
||||||
/// \endcode
|
|
||||||
///
|
|
||||||
/// \code{.operation}
|
|
||||||
/// VL := 512
|
|
||||||
/// VL_bytes := VL >> 3
|
|
||||||
/// row_index := row & 0xffff
|
|
||||||
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
|
|
||||||
/// FOR i := 0 TO (VL_bytes / 4) - 1
|
|
||||||
/// IF i + row_chunk / 4 >= tsrc.colsb / 4
|
|
||||||
/// dst.dword[i] := 0
|
|
||||||
/// ELSE
|
|
||||||
/// dst.word[2*i+0] := 0
|
|
||||||
/// dst.bf16[2*i+1] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
|
|
||||||
/// FI
|
|
||||||
/// ENDFOR
|
|
||||||
/// dst[MAX_VL-1:VL] := 0
|
|
||||||
/// zero_tileconfig_start()
|
|
||||||
/// \endcode
|
|
||||||
///
|
|
||||||
/// This intrinsic corresponds to the \c TCVTROWPS2BF16H instruction.
|
|
||||||
///
|
|
||||||
/// \param tsrc
|
|
||||||
/// The source tile. Max size is 1024 Bytes.
|
|
||||||
/// \param row
|
|
||||||
/// The the row of the source tile.
|
|
||||||
#define _tile_cvtrowps2bf16h(tsrc, row) \
|
|
||||||
__builtin_ia32_tcvtrowps2bf16h(tsrc, row)
|
|
||||||
|
|
||||||
/// Moves a row from a tile register to a zmm destination register, converting
|
|
||||||
/// the fp32 source elements to bf16. It places the resulting bf16 elements
|
|
||||||
/// in the low 16 bits within each dword. The row of the tile is selected
|
|
||||||
/// by a 32b GPR.
|
|
||||||
///
|
|
||||||
/// \headerfile <x86intrin.h>
|
|
||||||
///
|
|
||||||
/// \code
|
|
||||||
/// __m512i _tile_cvtrowps2bf16l(__tile tsrc, unsigned int row);
|
|
||||||
/// \endcode
|
|
||||||
///
|
|
||||||
/// \code{.operation}
|
|
||||||
/// VL := 512
|
|
||||||
/// VL_bytes := VL >> 3
|
|
||||||
/// row_index := row & 0xffff
|
|
||||||
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
|
|
||||||
/// FOR i := 0 TO (VL_bytes / 4) - 1
|
|
||||||
/// IF i + row_chunk / 4 >= tsrc.colsb / 4
|
|
||||||
/// dst.dword[i] := 0
|
|
||||||
/// ELSE
|
|
||||||
/// dst.word[2*i+1] := 0
|
|
||||||
/// dst.bf16[2*i+0] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
|
|
||||||
/// FI
|
|
||||||
/// ENDFOR
|
|
||||||
/// dst[MAX_VL-1:VL] := 0
|
|
||||||
/// zero_tileconfig_start()
|
|
||||||
/// \endcode
|
|
||||||
///
|
|
||||||
/// This intrinsic corresponds to the \c TCVTROWPS2BF16L instruction.
|
|
||||||
///
|
|
||||||
/// \param tsrc
|
|
||||||
/// The source tile. Max size is 1024 Bytes.
|
|
||||||
/// \param row
|
|
||||||
/// The the row of the source tile.
|
|
||||||
#define _tile_cvtrowps2bf16l(tsrc, row) \
|
|
||||||
__builtin_ia32_tcvtrowps2bf16l(tsrc, row)
|
|
||||||
|
|
||||||
/// Moves a row from a tile register to a zmm destination register, converting
|
|
||||||
/// the fp32 source elements to fp16. It places the resulting fp16 elements
|
|
||||||
/// in the high 16 bits within each dword. The row of the tile is selected
|
|
||||||
/// by a 32b GPR.
|
|
||||||
///
|
|
||||||
/// \headerfile <x86intrin.h>
|
|
||||||
///
|
|
||||||
/// \code
|
|
||||||
/// __m512i _tile_cvtrowps2phh(__tile tsrc, unsigned int row);
|
|
||||||
/// \endcode
|
|
||||||
///
|
|
||||||
/// \code{.operation}
|
|
||||||
/// VL := 512
|
|
||||||
/// VL_bytes := VL >> 3
|
|
||||||
/// row_index := row & 0xffff
|
|
||||||
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
|
|
||||||
/// FOR i := 0 TO (VL_bytes / 4) - 1
|
|
||||||
/// IF i + row_chunk / 4 >= tsrc.colsb / 4
|
|
||||||
/// dst.dword[i] := 0
|
|
||||||
/// ELSE
|
|
||||||
/// dst.word[2*i+0] := 0
|
|
||||||
/// dst.fp16[2*i+1] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
|
|
||||||
/// FI
|
|
||||||
/// ENDFOR
|
|
||||||
/// dst[MAX_VL-1:VL] := 0
|
|
||||||
/// zero_tileconfig_start()
|
|
||||||
/// \endcode
|
|
||||||
///
|
|
||||||
/// This intrinsic corresponds to the \c TCVTROWPS2PHH instruction.
|
|
||||||
///
|
|
||||||
/// \param tsrc
|
|
||||||
/// The source tile. Max size is 1024 Bytes.
|
|
||||||
/// \param row
|
|
||||||
/// The the row of the source tile.
|
|
||||||
#define _tile_cvtrowps2phh(tsrc, row) __builtin_ia32_tcvtrowps2phh(tsrc, row)
|
|
||||||
|
|
||||||
/// Moves a row from a tile register to a zmm destination register, converting
|
|
||||||
/// the fp32 source elements to fp16. It places the resulting fp16 elements
|
|
||||||
/// in the low 16 bits within each dword. The row of the tile is selected
|
|
||||||
/// by a 32b GPR.
|
|
||||||
///
|
|
||||||
/// \headerfile <x86intrin.h>
|
|
||||||
///
|
|
||||||
/// \code
|
|
||||||
/// __m512i _tile_cvtrowps2phl(__tile tsrc, unsigned int row);
|
|
||||||
/// \endcode
|
|
||||||
///
|
|
||||||
/// \code{.operation}
|
|
||||||
/// VL := 512
|
|
||||||
/// VL_bytes := VL >> 3
|
|
||||||
/// row_index := row & 0xffff
|
|
||||||
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
|
|
||||||
/// FOR i := 0 TO (VL_bytes / 4) - 1
|
|
||||||
/// IF i + row_chunk / 4 >= tsrc.colsb / 4
|
|
||||||
/// dst.dword[i] := 0
|
|
||||||
/// ELSE
|
|
||||||
/// dst.word[2*i+1] := 0
|
|
||||||
/// dst.fp16[2*i+0] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
|
|
||||||
/// FI
|
|
||||||
/// ENDFOR
|
|
||||||
/// dst[MAX_VL-1:VL] := 0
|
|
||||||
/// zero_tileconfig_start()
|
|
||||||
/// \endcode
|
|
||||||
///
|
|
||||||
/// This intrinsic corresponds to the \c TCVTROWPS2PHL instruction.
|
|
||||||
///
|
|
||||||
/// \param tsrc
|
|
||||||
/// The source tile. Max size is 1024 Bytes.
|
|
||||||
/// \param row
|
|
||||||
/// The the row of the source tile.
|
|
||||||
#define _tile_cvtrowps2phl(tsrc, row) __builtin_ia32_tcvtrowps2phl(tsrc, row)
|
|
||||||
|
|
||||||
/// Move one row of a tile data to a v16f32 data.
|
|
||||||
/// The row of the tile is selected by a 32b GPR.
|
|
||||||
///
|
|
||||||
/// \headerfile <immintrin.h>
|
|
||||||
///
|
|
||||||
/// \code
|
|
||||||
/// __m512 _tile_movrow(__tile a, unsigned b);
|
|
||||||
/// \endcode
|
|
||||||
///
|
|
||||||
/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
|
|
||||||
///
|
|
||||||
/// \param a
|
|
||||||
/// The 1st source tile. Max size is 1024 Bytes.
|
|
||||||
/// \param b
|
|
||||||
/// The 2nd source r32. Size is 4 Bytes.
|
|
||||||
/// \returns
|
|
||||||
/// The destination v16f32 data. Size is 64 Bytes.
|
|
||||||
///
|
|
||||||
/// \code{.operation}
|
|
||||||
/// VL := 512
|
|
||||||
/// VL_bytes := VL>>3
|
|
||||||
/// row_index := b&0xffff
|
|
||||||
/// row_chunk := ((b>>16)&0xffff) * VL_bytes
|
|
||||||
/// FOR i := 0 TO (VL_bytes-1)
|
|
||||||
/// IF (row_chunk + i >= a.colsb)
|
|
||||||
/// dst.byte[i] := 0
|
|
||||||
/// ELSE
|
|
||||||
/// dst.byte[i] := a.row[row_index].byte[row_chunk+i]
|
|
||||||
/// ENDFOR
|
|
||||||
/// \endcode
|
|
||||||
#define _tile_movrow(a, b) ((__m512i)__builtin_ia32_tilemovrow(a, b))
|
|
||||||
|
|
||||||
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
|
||||||
|
|
||||||
static __inline__ __m512 __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowd2ps_internal(
|
|
||||||
unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
|
|
||||||
return __builtin_ia32_tcvtrowd2ps_internal(m, n, src, u);
|
|
||||||
}
|
|
||||||
|
|
||||||
static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
|
|
||||||
_tile_cvtrowps2bf16h_internal(unsigned short m, unsigned short n,
|
|
||||||
_tile1024i src, unsigned u) {
|
|
||||||
return __builtin_ia32_tcvtrowps2bf16h_internal(m, n, src, u);
|
|
||||||
}
|
|
||||||
|
|
||||||
static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
|
|
||||||
_tile_cvtrowps2bf16l_internal(unsigned short m, unsigned short n,
|
|
||||||
_tile1024i src, unsigned u) {
|
|
||||||
return __builtin_ia32_tcvtrowps2bf16l_internal(m, n, src, u);
|
|
||||||
}
|
|
||||||
|
|
||||||
static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phh_internal(
|
|
||||||
unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
|
|
||||||
return __builtin_ia32_tcvtrowps2phh_internal(m, n, src, u);
|
|
||||||
}
|
|
||||||
|
|
||||||
static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phl_internal(
|
|
    unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
  return __builtin_ia32_tcvtrowps2phl_internal(m, n, src, u);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS_AVX512 _tile_movrow_internal(
    unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
  return (__m512i)__builtin_ia32_tilemovrow_internal(m, n, src, u);
}

/// Move a row from a tile (src0) to a v16f32 dst, converting the int32 source
/// elements to fp32. No SIMD exceptions are generated. Rounding is done as if
/// MXCSR.RC=RNE. Embedded rounding is not supported.
/// The row and chunk elements of the tile are fetched from the 32-bit src1.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCVTROWD2PS </c> instruction.
///
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source r32. Size is 4 Bytes.
/// \returns
///    The destination v16f32 data. Size is 64 Bytes.
__DEFAULT_FN_ATTRS_AVX512
static __m512 __tile_cvtrowd2ps(__tile1024i src0, unsigned src1) {
  return _tile_cvtrowd2ps_internal(src0.row, src0.col, src0.tile, src1);
}

/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source
/// elements to bf16 at the high 16 bits of each dword.
/// The row and chunk elements of the tile are fetched from the 32-bit src1.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCVTROWPS2BF16H </c> instruction.
///
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source r32. Size is 4 Bytes.
/// \returns
///    The destination v32bf16 data. Size is 64 Bytes.
__DEFAULT_FN_ATTRS_AVX512
static __m512bh __tile_cvtrowps2bf16h(__tile1024i src0, unsigned src1) {
  return _tile_cvtrowps2bf16h_internal(src0.row, src0.col, src0.tile, src1);
}

/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source
/// elements to bf16 at the low 16 bits of each dword.
/// The row and chunk elements of the tile are fetched from the 32-bit src1.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCVTROWPS2BF16L </c> instruction.
///
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source r32. Size is 4 Bytes.
/// \returns
///    The destination v32bf16 data. Size is 64 Bytes.
__DEFAULT_FN_ATTRS_AVX512
static __m512bh __tile_cvtrowps2bf16l(__tile1024i src0, unsigned src1) {
  return _tile_cvtrowps2bf16l_internal(src0.row, src0.col, src0.tile, src1);
}

/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source
/// elements to fp16 at the high 16 bits of each dword.
/// The row and chunk elements of the tile are fetched from the 32-bit src1.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCVTROWPS2PHH </c> instruction.
///
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source r32. Size is 4 Bytes.
/// \returns
///    The destination v32fp16 data. Size is 64 Bytes.
__DEFAULT_FN_ATTRS_AVX512
static __m512h __tile_cvtrowps2phh(__tile1024i src0, unsigned src1) {
  return _tile_cvtrowps2phh_internal(src0.row, src0.col, src0.tile, src1);
}

/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source
/// elements to fp16 at the low 16 bits of each dword.
/// The row and chunk elements of the tile are fetched from the 32-bit src1.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCVTROWPS2PHL </c> instruction.
///
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source r32. Size is 4 Bytes.
/// \returns
///    The destination v32fp16 data. Size is 64 Bytes.
__DEFAULT_FN_ATTRS_AVX512
static __m512h __tile_cvtrowps2phl(__tile1024i src0, unsigned src1) {
  return _tile_cvtrowps2phl_internal(src0.row, src0.col, src0.tile, src1);
}
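
/* Example (not part of the original header): a minimal sketch of how the H/L
 * conversion wrappers above pair up. Each call converts one tile row of fp32
 * elements to fp16; one variant places results in the high 16 bits of each
 * dword lane, the other in the low 16 bits. The helper name and the idea of
 * fetching both halves of the same row are illustrative assumptions. */
static void __tile_cvtrow_pair_example(__m512h *hi, __m512h *lo,
                                       __tile1024i src0, unsigned row) {
  *hi = __tile_cvtrowps2phh(src0, row); /* fp16 in high 16 bits of each dword */
  *lo = __tile_cvtrowps2phl(src0, row); /* fp16 in low 16 bits of each dword */
}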

/// Move one row of tile data to a v16i32 data.
/// The row of the tile is selected by a 32-bit GPR.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
///
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source r32. Size is 4 Bytes.
/// \returns
///    The destination v16i32 data. Size is 64 Bytes.
__DEFAULT_FN_ATTRS_AVX512
static __m512i __tile_movrow(__tile1024i src0, unsigned src1) {
  return (__m512i)_tile_movrow_internal(src0.row, src0.col, src0.tile, src1);
}
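
/* Example (not part of the original header): a hedged sketch showing
 * __tile_movrow pulling one row of an int32 accumulator tile into a plain
 * __m512i vector so it can be post-processed with ordinary AVX-512
 * intrinsics. The reduction helper below is an illustrative assumption and
 * requires AVX-512F support in the enclosing translation unit. */
static int __tile_row_sum_example(__tile1024i acc, unsigned row) {
  __m512i v = __tile_movrow(acc, row); /* TILEMOVROW: one 64-byte row */
  return _mm512_reduce_add_epi32(v);   /* sum of the 16 int32 elements */
}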

#endif // __x86_64__ && __SSE2__
#endif // __AMX_AVX512INTRIN_H
@@ -1,94 +0,0 @@
/*===----- amxbf16transposeintrin.h - AMX-BF16 and AMX-TRANSPOSE ------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error \
    "Never use <amxbf16transposeintrin.h> directly; use <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMX_BF16TRANSPOSEINTRIN_H
#define __AMX_BF16TRANSPOSEINTRIN_H
#ifdef __x86_64__

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("amx-bf16,amx-transpose")))

/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
/// tiles \a a and \a b, accumulating the intermediate single-precision
/// (32-bit) floating-point elements with elements in \a dst, and store the
/// 32-bit result back to tile \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_tdpbf16ps (__tile dst, __tile a, __tile b)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.bf32[n] += FP32(a.row[m].bf16[2*k+0]) *
///                      FP32(b.row[k].bf16[2*n+0])
///       tmp.bf32[n] += FP32(a.row[m].bf16[2*k+1]) *
///                      FP32(b.row[k].bf16[2*n+1])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TTDPBF16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps((dst), (a), (b))

/// This is an internal intrinsic. C/C++ users should avoid calling it
/// directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS
_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_ttdpbf16ps_internal(m, n, k, dst, src1, src2);
}

/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
/// tiles src0 and src1, accumulating the intermediate single-precision
/// (32-bit) floating-point elements with elements in "dst", and store the
/// 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TTDPBF16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS
static __inline__ void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src0,
                                        __tile1024i src1) {
  dst->tile = _tile_tdpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
                                       src0.tile, src1.tile);
}
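
/* Example (not part of the original header): a hedged end-to-end sketch of
 * the transposed BF16 matmul wrapper. It assumes the __tile1024i helper API
 * from <amxintrin.h> (__tile_loadd, __tile_zero, __tile_stored), a tile
 * configuration that was already loaded, and illustrative 16x64-byte tile
 * shapes; none of those specifics come from this file. */
static void __tile_tdpbf16ps_example(float *c, const __bf16 *a,
                                     const __bf16 *b, __SIZE_TYPE__ stride_c,
                                     __SIZE_TYPE__ stride_a,
                                     __SIZE_TYPE__ stride_b) {
  __tile1024i ta = {16, 64}, tb = {16, 64}, tc = {16, 64};
  __tile_loadd(&ta, a, stride_a); /* bf16 pairs; transposed by the matmul */
  __tile_loadd(&tb, b, stride_b);
  __tile_zero(&tc);
  __tile_tdpbf16ps(&tc, ta, tb); /* tc += transpose(ta) * tb, fp32 accum */
  __tile_stored(c, stride_c, tc);
}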

#undef __DEFAULT_FN_ATTRS

#endif /* __x86_64__ */
#endif /* __AMX_BF16TRANSPOSEINTRIN_H */
@@ -1,167 +0,0 @@
/*===--------- amxcomplexintrin.h - AMXCOMPLEX intrinsics -*- C++ -*---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <amxcomplexintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AMX_COMPLEXINTRIN_H
#define __AMX_COMPLEXINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS_COMPLEX                                             \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-complex")))

/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles \a a and \a b is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// Calculates the imaginary part of the result. For each possible combination
/// of (row of \a a, column of \a b), it performs a set of multiplication
/// and accumulations on all corresponding complex numbers (one from \a a
/// and one from \a b). The imaginary part of the \a a element is multiplied
/// with the real part of the corresponding \a b element, and the real part
/// of the \a a element is multiplied with the imaginary part of the
/// corresponding \a b elements. The two accumulated results are added, and
/// then accumulated into the corresponding row and column of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_cmmimfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_cmmimfp16ps(dst, a, b) __builtin_ia32_tcmmimfp16ps(dst, a, b)

/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles \a a and \a b is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// Calculates the real part of the result. For each possible combination
/// of (row of \a a, column of \a b), it performs a set of multiplication
/// and accumulations on all corresponding complex numbers (one from \a a
/// and one from \a b). The real part of the \a a element is multiplied
/// with the real part of the corresponding \a b element, and the negated
/// imaginary part of the \a a element is multiplied with the imaginary
/// part of the corresponding \a b elements. The two accumulated results
/// are added, and then accumulated into the corresponding row and column
/// of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_cmmrlfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
///       tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCMMRLFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_cmmrlfp16ps(dst, a, b) __builtin_ia32_tcmmrlfp16ps(dst, a, b)

static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
_tile_cmmimfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
                           _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tcmmimfp16ps_internal(m, n, k, dst, src1, src2);
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
_tile_cmmrlfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
                           _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
}

/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles src0 and src1 is interpreted as a complex number with
/// FP16 real part and FP16 imaginary part.
/// This function calculates the imaginary part of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCMMIMFP16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
static __inline__ void __DEFAULT_FN_ATTRS_COMPLEX
__tile_cmmimfp16ps(__tile1024i *dst, __tile1024i src0, __tile1024i src1) {
  dst->tile = _tile_cmmimfp16ps_internal(src0.row, src1.col, src0.col,
                                         dst->tile, src0.tile, src1.tile);
}

/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles src0 and src1 is interpreted as a complex number with
/// FP16 real part and FP16 imaginary part.
/// This function calculates the real part of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCMMRLFP16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
static __inline__ void __DEFAULT_FN_ATTRS_COMPLEX
__tile_cmmrlfp16ps(__tile1024i *dst, __tile1024i src0, __tile1024i src1) {
  dst->tile = _tile_cmmrlfp16ps_internal(src0.row, src1.col, src0.col,
                                         dst->tile, src0.tile, src1.tile);
}
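
/* Example (not part of the original header): a sketch combining the two
 * wrappers above into one complex matmul step. One call accumulates the real
 * parts and the other the imaginary parts into separate fp32 tiles; the
 * helper name and the two-accumulator layout are illustrative assumptions. */
static __inline__ void __DEFAULT_FN_ATTRS_COMPLEX
__tile_cmm_example(__tile1024i *re, __tile1024i *im, __tile1024i a,
                   __tile1024i b) {
  __tile_cmmrlfp16ps(re, a, b); /* re += Re(a x b), fp32 accumulation */
  __tile_cmmimfp16ps(im, a, b); /* im += Im(a x b), fp32 accumulation */
}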

#endif // __x86_64__
#endif // __AMX_COMPLEXINTRIN_H
@@ -1,303 +0,0 @@
/*===----- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error \
    "Never use <amxcomplextransposeintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H
#define __AMX_COMPLEXTRANSPOSEINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("amx-complex,amx-transpose")))

/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles \a a and \a b is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// Calculates the imaginary part of the result. For each possible combination
/// of (transposed column of \a a, column of \a b), it performs a set of
/// multiplication and accumulations on all corresponding complex numbers
/// (one from \a a and one from \a b). The imaginary part of the \a a element
/// is multiplied with the real part of the corresponding \a b element, and
/// the real part of the \a a element is multiplied with the imaginary part
/// of the corresponding \a b elements. The two accumulated results are
/// added, and then accumulated into the corresponding row and column of
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO a.rows - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_tcmmimfp16ps(dst, a, b)                                          \
  __builtin_ia32_ttcmmimfp16ps((dst), (a), (b))

/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles \a a and \a b is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// Calculates the real part of the result. For each possible combination
/// of (transposed column of \a a, column of \a b), it performs a set of
/// multiplication and accumulations on all corresponding complex numbers
/// (one from \a a and one from \a b). The real part of the \a a element is
/// multiplied with the real part of the corresponding \a b element, and the
/// negated imaginary part of the \a a element is multiplied with the
/// imaginary part of the corresponding \a b elements. The two accumulated
/// results are added, and then accumulated into the corresponding row and
/// column of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_tcmmrlfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO a.rows - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
///       tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TTCMMRLFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_tcmmrlfp16ps(dst, a, b)                                          \
  __builtin_ia32_ttcmmrlfp16ps((dst), (a), (b))

/// Perform matrix conjugate transpose and multiplication of two tiles
/// containing complex elements and accumulate the results into a packed
/// single precision tile. Each dword element in input tiles \a a and \a b
/// is interpreted as a complex number with FP16 real part and FP16 imaginary
/// part.
/// Calculates the imaginary part of the result. For each possible combination
/// of (transposed column of \a a, column of \a b), it performs a set of
/// multiplication and accumulations on all corresponding complex numbers
/// (one from \a a and one from \a b). The negated imaginary part of the \a a
/// element is multiplied with the real part of the corresponding \a b
/// element, and the real part of the \a a element is multiplied with the
/// imaginary part of the corresponding \a b elements. The two accumulated
/// results are added, and then accumulated into the corresponding row and
/// column of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_conjtcmmimfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO a.rows - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
///       tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCONJTCMMIMFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_conjtcmmimfp16ps(dst, a, b)                                      \
  __builtin_ia32_tconjtcmmimfp16ps((dst), (a), (b))

/// Perform conjugate transpose of an FP16-pair of complex elements from \a a
/// and write the result to \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_conjtfp16(__tile dst, __tile a);
/// \endcode
///
/// \code{.operation}
/// FOR i := 0 TO dst.rows - 1
///   FOR j := 0 TO (dst.colsb / 4) - 1
///     tmp.fp16[2*j+0] := a.row[j].fp16[2*i+0]
///     tmp.fp16[2*j+1] := -a.row[j].fp16[2*i+1]
///   ENDFOR
///   write_row_and_zero(dst, i, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCONJTFP16 instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The source tile. Max size is 1024 Bytes.
#define _tile_conjtfp16(dst, a) __builtin_ia32_tconjtfp16((dst), (a))

static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmimfp16ps_internal(
    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
    _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_ttcmmimfp16ps_internal(m, n, k, dst, src1, src2);
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmrlfp16ps_internal(
    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
    _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_ttcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtcmmimfp16ps_internal(
    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
    _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tconjtcmmimfp16ps_internal(m, n, k, dst, src1, src2);
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS
_tile_conjtfp16_internal(unsigned short m, unsigned short n, _tile1024i src) {
  return __builtin_ia32_tconjtfp16_internal(m, n, src);
}

/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles src0 and src1 is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// This function calculates the imaginary part of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TTCMMIMFP16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS
static void __tile_tcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
                                __tile1024i src1) {
  dst->tile = _tile_tcmmimfp16ps_internal(src0.row, src1.col, src0.col,
                                          dst->tile, src0.tile, src1.tile);
}

/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles src0 and src1 is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// This function calculates the real part of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TTCMMRLFP16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS
static void __tile_tcmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
                                __tile1024i src1) {
  dst->tile = _tile_tcmmrlfp16ps_internal(src0.row, src1.col, src0.col,
                                          dst->tile, src0.tile, src1.tile);
}

/// Perform matrix conjugate transpose and multiplication of two tiles
/// containing complex elements and accumulate the results into a packed
/// single precision tile. Each dword element in input tiles src0 and src1
/// is interpreted as a complex number with FP16 real part and FP16 imaginary
/// part.
/// This function calculates the imaginary part of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCONJTCMMIMFP16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS
static void __tile_conjtcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
                                    __tile1024i src1) {
  dst->tile = _tile_conjtcmmimfp16ps_internal(src0.row, src1.col, src0.col,
                                              dst->tile, src0.tile, src1.tile);
}

/// Perform conjugate transpose of an FP16-pair of complex elements from src
/// and write the result to dst.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCONJTFP16 </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src
///    The source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS
static void __tile_conjtfp16(__tile1024i *dst, __tile1024i src) {
  dst->tile = _tile_conjtfp16_internal(src.row, src.col, src.tile);
}
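
/* Example (not part of the original header): a sketch pairing the transposed
 * complex wrappers above, accumulating real and imaginary parts of
 * transpose(src0) * src1 into two separate fp32 tiles. The helper name and
 * two-accumulator layout are illustrative assumptions. */
__DEFAULT_FN_ATTRS
static void __tile_tcmm_example(__tile1024i *re, __tile1024i *im,
                                __tile1024i src0, __tile1024i src1) {
  __tile_tcmmrlfp16ps(re, src0, src1); /* re += Re(transpose(src0) x src1) */
  __tile_tcmmimfp16ps(im, src0, src1); /* im += Im(transpose(src0) x src1) */
}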

#undef __DEFAULT_FN_ATTRS

#endif // __x86_64__
#endif // __AMX_COMPLEXTRANSPOSEINTRIN_H
@@ -1,93 +0,0 @@
/*===------------- amxfp16intrin.h - AMX_FP16 intrinsics -*- C++ -*---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <amxfp16intrin.h> directly; use <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMX_FP16INTRIN_H
#define __AMX_FP16INTRIN_H
#ifdef __x86_64__

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))

/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a
/// and \a b, accumulating the intermediate single-precision (32-bit)
/// floating-point elements with elements in \a dst, and store the 32-bit
/// result back to tile \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_dpfp16ps (__tile dst, __tile a, __tile b)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
///                      FP32(b.row[k].fp16[2*n+0])
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
///                      FP32(b.row[k].fp16[2*n+1])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpfp16ps(dst, a, b)                                              \
  __builtin_ia32_tdpfp16ps(dst, a, b)

/// This is an internal intrinsic. C/C++ users should avoid calling it
/// directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS
_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
}

/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS
static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
                                       __tile1024i src1) {
  dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
                                      src0.tile, src1.tile);
}
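
/* Example (not part of the original header): a hedged sketch of the FP16
 * dot-product wrapper in use. It assumes the __tile1024i helpers from
 * <amxintrin.h> (__tile_loadd, __tile_zero, __tile_stored) and an
 * already-loaded tile configuration; shapes and strides are illustrative. */
static void __tile_dpfp16ps_example(float *c, const _Float16 *a,
                                    const _Float16 *b) {
  __tile1024i ta = {16, 64}, tb = {16, 64}, tc = {16, 64};
  __tile_loadd(&ta, a, 64);
  __tile_loadd(&tb, b, 64);
  __tile_zero(&tc);
  __tile_dpfp16ps(&tc, ta, tb); /* tc += ta * tb with fp32 accumulation */
  __tile_stored(c, 64, tc);
}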

#undef __DEFAULT_FN_ATTRS

#endif /* __x86_64__ */
#endif /* __AMX_FP16INTRIN_H */
@@ -1,94 +0,0 @@
/*===----- amxfp16transposeintrin.h - AMX-FP16 and AMX-TRANSPOSE ------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error \
    "Never use <amxfp16transposeintrin.h> directly; use <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMX_FP16TRANSPOSEINTRIN_H
#define __AMX_FP16TRANSPOSEINTRIN_H
#ifdef __x86_64__

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("amx-fp16,amx-transpose")))

/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in
/// tiles \a a and \a b, accumulating the intermediate single-precision
/// (32-bit) floating-point elements with elements in \a dst, and store the
/// 32-bit result back to tile \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_tdpfp16ps (__tile dst, __tile a, __tile b)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
///                      FP32(b.row[k].fp16[2*n+0])
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
///                      FP32(b.row[k].fp16[2*n+1])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TTDPFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_tdpfp16ps(dst, a, b) __builtin_ia32_ttdpfp16ps((dst), (a), (b))

/// This is an internal intrinsic. C/C++ users should avoid calling it
/// directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS
_tile_tdpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_ttdpfp16ps_internal(m, n, k, dst, src1, src2);
}

/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in
/// tiles src0 and src1, accumulating the intermediate single-precision
/// (32-bit) floating-point elements with elements in "dst", and store the
/// 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TTDPFP16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS
static __inline__ void __tile_tdpfp16ps(__tile1024i *dst, __tile1024i src0,
                                        __tile1024i src1) {
  dst->tile = _tile_tdpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
                                       src0.tile, src1.tile);
}

#undef __DEFAULT_FN_ATTRS

#endif /* __x86_64__ */
#endif /* __AMX_FP16TRANSPOSEINTRIN_H */
@@ -1,230 +0,0 @@
/*===------------- amxfp8intrin.h - AMX intrinsics -*- C++ -*----------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <amxfp8intrin.h> directly; include <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMXFP8INTRIN_H
#define __AMXFP8INTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS_FP8                                                 \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-fp8")))

static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
_tile_dpbf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbf8ps_internal(m, n, k, dst, src1, src2);
}

/// Perform the dot product of a BF8 value \a src1 by a BF8 value \a src2
/// accumulating into a Single Precision (FP32) source/dest \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void __tile_dpbf8ps (__tile1024i *dst, __tile1024i src1, __tile1024i src2)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   temp1[(dst.colsb / 4 - 1) : 0] = 0
///   FOR k := 0 TO src1.colsb / 4 - 1
///     FOR n := 0 TO dst.colsb / 4 - 1
///       temp1[n] +=
///         INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])
///         + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])
///         + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])
///         + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])
///     ENDFOR
///   ENDFOR
///   FOR n := 0 TO dst.colsb / 4 - 1
///     tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPBF8PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src1
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src2
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_FP8 static void
__tile_dpbf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {
  dst->tile = _tile_dpbf8ps_internal(src1.row, src2.col, src1.col, dst->tile,
                                     src1.tile, src2.tile);
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
_tile_dpbhf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbhf8ps_internal(m, n, k, dst, src1, src2);
}

/// Perform the dot product of a BF8 value \a src1 by an HF8 value \a src2
/// accumulating into a Single Precision (FP32) source/dest \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void __tile_dpbhf8ps (__tile1024i *dst, __tile1024i src1, __tile1024i src2)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   temp1[(dst.colsb / 4 - 1) : 0] = 0
///   FOR k := 0 TO src1.colsb / 4 - 1
///     FOR n := 0 TO dst.colsb / 4 - 1
///       temp1[n] +=
///         INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])
///         + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])
///         + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])
///         + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])
///     ENDFOR
///   ENDFOR
///   FOR n := 0 TO dst.colsb / 4 - 1
///     tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPBHF8PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src1
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src2
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_FP8 static void
__tile_dpbhf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {
  dst->tile = _tile_dpbhf8ps_internal(src1.row, src2.col, src1.col, dst->tile,
                                      src1.tile, src2.tile);
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
_tile_dphbf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdphbf8ps_internal(m, n, k, dst, src1, src2);
}

/// Perform the dot product of an HF8 value \a src1 by a BF8 value \a src2
/// accumulating into a Single Precision (FP32) source/dest \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void __tile_dphbf8ps (__tile1024i *dst, __tile1024i src1, __tile1024i src2)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   temp1[(dst.colsb / 4 - 1) : 0] = 0
///   FOR k := 0 TO src1.colsb / 4 - 1
///     FOR n := 0 TO dst.colsb / 4 - 1
///       temp1[n] +=
///         INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])
///         + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])
///         + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])
///         + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])
///     ENDFOR
///   ENDFOR
///   FOR n := 0 TO dst.colsb / 4 - 1
///     tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPHBF8PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src1
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src2
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_FP8 static void
__tile_dphbf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {
  dst->tile = _tile_dphbf8ps_internal(src1.row, src2.col, src1.col, dst->tile,
                                      src1.tile, src2.tile);
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
_tile_dphf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdphf8ps_internal(m, n, k, dst, src1, src2);
}

/// Perform the dot product of an HF8 value \a src1 by an HF8 value \a src2
/// accumulating into a Single Precision (FP32) source/dest \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void __tile_dphf8ps (__tile1024i *dst, __tile1024i src1, __tile1024i src2)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   temp1[(dst.colsb / 4 - 1) : 0] = 0
///   FOR k := 0 TO src1.colsb / 4 - 1
///     FOR n := 0 TO dst.colsb / 4 - 1
///       temp1[n] +=
///         INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])
///         + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])
///         + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])
///         + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])
///     ENDFOR
///   ENDFOR
///   FOR n := 0 TO dst.colsb / 4 - 1
///     tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPHF8PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src1
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src2
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_FP8 static void
__tile_dphf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {
  dst->tile = _tile_dphf8ps_internal(src1.row, src2.col, src1.col, dst->tile,
                                     src1.tile, src2.tile);
}
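
/* Example (not part of the original header): the four FP8 wrappers above
 * differ only in whether each operand holds BF8 (E5M2) or HF8 (E4M3) data;
 * this is a sketch dispatching on two illustrative flags, not a library
 * helper. */
__DEFAULT_FN_ATTRS_FP8 static void
__tile_dp_fp8_example(__tile1024i *dst, __tile1024i src1, __tile1024i src2,
                      int src1_is_hf8, int src2_is_hf8) {
  if (src1_is_hf8 && src2_is_hf8)
    __tile_dphf8ps(dst, src1, src2); /* HF8 x HF8 */
  else if (src1_is_hf8)
    __tile_dphbf8ps(dst, src1, src2); /* HF8 x BF8 */
  else if (src2_is_hf8)
    __tile_dpbhf8ps(dst, src1, src2); /* BF8 x HF8 */
  else
    __tile_dpbf8ps(dst, src1, src2); /* BF8 x BF8 */
}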

#define _tile_dpbf8ps(dst, src1, src2)                                         \
  __builtin_ia32_tdpbf8ps((dst), (src1), (src2))
#define _tile_dpbhf8ps(dst, src1, src2)                                        \
  __builtin_ia32_tdpbhf8ps((dst), (src1), (src2))
#define _tile_dphbf8ps(dst, src1, src2)                                        \
  __builtin_ia32_tdphbf8ps((dst), (src1), (src2))
#define _tile_dphf8ps(dst, src1, src2)                                         \
  __builtin_ia32_tdphf8ps((dst), (src1), (src2))

#undef __DEFAULT_FN_ATTRS_FP8

#endif /* __x86_64__ */
#endif /* __AMXFP8INTRIN_H */
@@ -1,494 +0,0 @@
/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMXINTRIN_H
#define __AMXINTRIN_H
#ifdef __x86_64__

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS_TILE                                                \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
#define __DEFAULT_FN_ATTRS_INT8                                                \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
#define __DEFAULT_FN_ATTRS_BF16                                                \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))

/// Load tile configuration from a 64-byte memory location specified by
/// "mem_addr". The tile configuration includes the tile type palette, the
/// number of bytes per row, and the number of rows. If the specified
/// palette_id is zero, that signifies the init state for both the tile
/// config and the tile data, and the tiles are zeroed. Any invalid
/// configurations will result in a #GP fault.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
///
/// \param __config
///    A pointer to the 512-bit configuration data.
static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_loadconfig(const void *__config) {
  __builtin_ia32_tile_loadconfig(__config);
}

/// Stores the current tile configuration to a 64-byte memory location
/// specified by "mem_addr". The tile configuration includes the tile type
/// palette, the number of bytes per row, and the number of rows. If tiles
/// are not configured, all zeroes will be stored to memory.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
///
/// \param __config
///    A pointer to the 512-bit configuration data.
static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_storeconfig(void *__config) {
  __builtin_ia32_tile_storeconfig(__config);
}
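
/* Example (not part of the original header): a hedged sketch of building the
 * 64-byte configuration that _tile_loadconfig consumes. The struct layout
 * mirrors the LDTILECFG memory format (palette byte 0, start_row byte 1,
 * reserved bytes, 16-bit colsb entries at byte 16, row counts at byte 48);
 * the struct itself is an illustrative assumption, not a library type. */
typedef struct __example_tilecfg {
  unsigned char palette_id;   /* byte 0: palette selector                */
  unsigned char start_row;    /* byte 1: restart row for interrupted ops */
  unsigned char reserved[14]; /* bytes 2..15: must be zero               */
  unsigned short colsb[16];   /* bytes 16..47: bytes per row, per tile   */
  unsigned char rows[16];     /* bytes 48..63: rows, per tile            */
} __example_tilecfg;

static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_configure_example(void) {
  __example_tilecfg cfg = {0};
  cfg.palette_id = 1; /* palette 1: up to eight 16x64-byte tiles */
  cfg.colsb[0] = 64;  /* tmm0: 16 rows x 64 bytes                */
  cfg.rows[0] = 16;
  _tile_loadconfig(&cfg);
}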

/// Release the tile configuration to return to the init state, which
/// releases all storage it currently holds.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
  __builtin_ia32_tilerelease();
}

/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured
/// via "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
#define _tile_loadd(dst, base, stride)                                         \
  __builtin_ia32_tileloadd64((dst), ((const void *)(base)),                    \
                             (__SIZE_TYPE__)(stride))

/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured
/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
/// that the data will likely not be reused in the near future and the data
/// caching can be optimized accordingly.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
#define _tile_stream_loadd(dst, base, stride)                                  \
  __builtin_ia32_tileloaddt164((dst), ((const void *)(base)),                  \
                               (__SIZE_TYPE__)(stride))

/// Store the tile specified by "src" to memory specified by "base" address and
/// "stride" using the tile configuration previously configured via
/// "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be stored in memory.
#define _tile_stored(dst, base, stride)                                        \
  __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))

/// Zero the tile specified by "tdest".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
/// \param tile
///    The destination tile to be zeroed. Max size is 1024 Bytes.
#define _tile_zero(tile) __builtin_ia32_tilezero((tile))

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbssd(dst, src0, src1)                                          \
  __builtin_ia32_tdpbssd((dst), (src0), (src1))
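
/* Example (not part of the original header): a hedged end-to-end sketch of
 * the register-number macro API: zero an accumulator, load two int8 operand
 * tiles, multiply-accumulate, store, release. Assumes tiles 0..2 were
 * configured (e.g., as in the configuration sketch above) and that the OS
 * has enabled AMX state for this thread. */
static __inline__ void __DEFAULT_FN_ATTRS_INT8
_tile_dpbssd_example(int *c, const signed char *a, const signed char *b) {
  _tile_zero(0);         /* tmm0: int32 accumulator       */
  _tile_loadd(1, a, 64); /* tmm1: signed 8-bit operands   */
  _tile_loadd(2, b, 64); /* tmm2: signed 8-bit operands   */
  _tile_dpbssd(0, 1, 2); /* tmm0 += tmm1 . tmm2 (TDPBSSD) */
  _tile_stored(0, c, 64);
  _tile_release();
}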

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbsud(dst, src0, src1) \
  __builtin_ia32_tdpbsud((dst), (src0), (src1))

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbusd(dst, src0, src1) \
  __builtin_ia32_tdpbusd((dst), (src0), (src1))

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
/// "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbuud(dst, src0, src1) \
  __builtin_ia32_tdpbuud((dst), (src0), (src1))

/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbf16ps(dst, src0, src1) \
  __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
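/* Illustrative sketch (not part of this header): one way to model the BF16
 * pair product that TDPBF16PS accumulates. A bf16 value is the upper 16 bits
 * of an IEEE-754 float, so widening is a 16-bit left shift; the helper names
 * below are hypothetical and this is a scalar model, not the hardware
 * algorithm.
 *
 *   #include <string.h>
 *   static float __bf16_to_fp32(unsigned short v) {
 *     unsigned int bits = (unsigned int)v << 16; // bf16 == high half of fp32
 *     float f;
 *     memcpy(&f, &bits, sizeof(f));
 *     return f;
 *   }
 *   static float __dpbf16_elem(float acc, const unsigned short a[2],
 *                              const unsigned short b[2]) {
 *     for (int i = 0; i < 2; ++i)
 *       acc += __bf16_to_fp32(a[i]) * __bf16_to_fp32(b[i]);
 *     return acc;
 *   }
 */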

/// AMX tile register size can be configured; the maximum size is 16x64=1024
/// bytes. Since there is no 2D type in LLVM IR, we use a vector type to
/// represent the 2D tile, and its fixed size is the maximum AMX tile register
/// size.
typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
typedef int _tile1024i_1024a
    __attribute__((__vector_size__(1024), __aligned__(1024)));

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE
_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
                     __SIZE_TYPE__ stride) {
  return __builtin_ia32_tileloadd64_internal(m, n, base,
                                             (__SIZE_TYPE__)(stride));
}

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE
_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
                       __SIZE_TYPE__ stride) {
  return __builtin_ia32_tileloaddt164_internal(m, n, base,
                                               (__SIZE_TYPE__)(stride));
}

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
}

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
}

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
}

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
}

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_stored_internal(unsigned short m, unsigned short n, void *base,
                      __SIZE_TYPE__ stride, _tile1024i tile) {
  return __builtin_ia32_tilestored64_internal(m, n, base,
                                              (__SIZE_TYPE__)(stride), tile);
}

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
}

/// This struct packs the shape and tile data together for the user. We suggest
/// initializing the struct as early as possible, because the compiler depends
/// on the shape information to do the configuration. Constant shape values are
/// preferred, as they allow the compiler to optimize.
typedef struct __tile1024i_str {
  const unsigned short row;
  const unsigned short col;
  _tile1024i tile;
} __tile1024i;
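/* Usage sketch (illustrative): the shape is fixed at initialization and, per
 * the note above, should be a compile-time constant so the compiler can
 * configure the tiles. From the _tile_loadd_internal usage below, row appears
 * to be the number of rows and col the row width in bytes; both readings are
 * assumptions here.
 *
 *   __tile1024i a = {16, 64}; // 16 rows x 64 bytes, the maximum tile shape
 *   __tile1024i b = {16, 64};
 *   __tile1024i c = {16, 64};
 */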

/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
                                    __SIZE_TYPE__ stride) {
  dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
}

/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst". This intrinsic provides a hint to the implementation
/// that the data will likely not be reused in the near future and the data
/// caching can be optimized accordingly.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
                                           __SIZE_TYPE__ stride) {
  dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
}

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
                                     __tile1024i src1) {
  dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
                                    src0.tile, src1.tile);
}

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
                                     __tile1024i src1) {
  dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
                                    src0.tile, src1.tile);
}

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
                                     __tile1024i src1) {
  dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
                                    src0.tile, src1.tile);
}

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
/// "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
                                     __tile1024i src1) {
  dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
                                    src0.tile, src1.tile);
}

/// Store the tile specified by "src" to memory specified by "base" address and
/// "stride".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be stored in memory.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
                                     __tile1024i src) {
  _tile_stored_internal(src.row, src.col, base, stride, src.tile);
}
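/* End-to-end sketch (illustrative, assumes AMX int8 support, a configured
 * tile palette, and pointers A, B, C to suitably sized buffers): load two
 * int8 tiles, accumulate their dot-product into a zeroed tile, and store the
 * int32 result. The function name is hypothetical.
 *
 *   static void __mm_tile_sketch(const void *A, const void *B, void *C,
 *                                __SIZE_TYPE__ stride) {
 *     __tile1024i a = {16, 64};
 *     __tile1024i b = {16, 64};
 *     __tile1024i c = {16, 64};
 *     __tile_loadd(&a, A, stride);
 *     __tile_loadd(&b, B, stride);
 *     __tile_zero(&c);          // defined just below
 *     __tile_dpbssd(&c, a, b);  // c += a . b (int8 -> int32)
 *     __tile_stored(C, stride, c);
 *   }
 */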

/// Zero the tile specified by "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
/// \param dst
///    The destination tile to be zeroed. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_zero(__tile1024i *dst) {
  dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
}

/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_BF16
static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
                                       __tile1024i src1) {
  dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
                                      src0.tile, src1.tile);
}

#undef __DEFAULT_FN_ATTRS_TILE
#undef __DEFAULT_FN_ATTRS_INT8
#undef __DEFAULT_FN_ATTRS_BF16

#endif /* __x86_64__ */
#endif /* __AMXINTRIN_H */
@@ -1,48 +0,0 @@
/*===-------- amxmovrsintrin.h - AMX MOVRS intrinsics -*- C++ -*---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 * ===-------------------------------------------------------------------=== */

#ifndef __IMMINTRIN_H
#error "Never use <amxmovrsintrin.h> directly; include <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMXMOVRSINTRIN_H
#define __AMXMOVRSINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS_MOVRS \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-movrs")))

#define _tile_loaddrs(dst, base, stride) \
  __builtin_ia32_tileloaddrs64((dst), ((const void *)(base)), \
                               (__SIZE_TYPE__)(stride))
#define _tile_stream_loaddrs(dst, base, stride) \
  __builtin_ia32_tileloaddrst164((dst), ((const void *)(base)), \
                                 (__SIZE_TYPE__)(stride))
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS
_tile_loaddrs_internal(unsigned short m, unsigned short n, const void *base,
                       __SIZE_TYPE__ stride) {
  return __builtin_ia32_tileloaddrs64_internal(m, n, base,
                                               (__SIZE_TYPE__)(stride));
}
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS
_tile_loaddrst1_internal(unsigned short m, unsigned short n, const void *base,
                         __SIZE_TYPE__ stride) {
  return __builtin_ia32_tileloaddrst164_internal(m, n, base,
                                                 (__SIZE_TYPE__)(stride));
}
static __inline__ void __DEFAULT_FN_ATTRS_MOVRS
__tile_loaddrs(__tile1024i *dst, const void *base, __SIZE_TYPE__ stride) {
  dst->tile = _tile_loaddrs_internal(dst->row, dst->col, base, stride);
}
static __inline__ void __DEFAULT_FN_ATTRS_MOVRS __tile_stream_loaddrs(
    __tile1024i *dst, const void *base, __SIZE_TYPE__ stride) {
  dst->tile = _tile_loaddrst1_internal(dst->row, dst->col, base, stride);
}
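/* Usage sketch (illustrative): the MOVRS variants appear to follow the same
 * shape/stride contract as __tile_loadd/__tile_stream_loadd, adding a
 * "read shared" hint for data that several readers will touch; treat that
 * characterization as an assumption.
 *
 *   __tile1024i a = {16, 64};
 *   __tile_loaddrs(&a, base, stride); // base/stride as for __tile_loadd
 */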
#undef __DEFAULT_FN_ATTRS_MOVRS
#endif /* __x86_64__ */
#endif /* __AMXMOVRSINTRIN_H */
@@ -1,200 +0,0 @@
/* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics --------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 * ===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error \
    "Never use <amxmovrstransposeintrin.h> directly; use <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMX_MOVRS_TRANSPOSEINTRIN_H
#define __AMX_MOVRS_TRANSPOSEINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("amx-transpose,amx-movrs")))

#define _tile_2rpntlvwz0rs(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride)
#define _tile_2rpntlvwz0rst1(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride)
#define _tile_2rpntlvwz1rs(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride)
#define _tile_2rpntlvwz1rst1(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride)

static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rs_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  // Use __tile1024i_1024a* to escape the alignment check in
  // clang/test/Headers/x86-intrinsics-headers-clean.cpp
  __builtin_ia32_t2rpntlvwz0rs_internal(
      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
      (__SIZE_TYPE__)(stride));
}

static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rst1_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  __builtin_ia32_t2rpntlvwz0rst1_internal(
      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
      (__SIZE_TYPE__)(stride));
}

static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rs_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  __builtin_ia32_t2rpntlvwz1rs_internal(
      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
      (__SIZE_TYPE__)(stride));
}

static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rst1_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  __builtin_ia32_t2rpntlvwz1rst1_internal(
      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
      (__SIZE_TYPE__)(stride));
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written.
/// Provides a hint to the implementation that the data will likely become
/// read shared in the near future and the data caching can be optimized.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ0RS </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS
static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1,
                                const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                              &dst1->tile, base, stride);
}
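/* Usage sketch (illustrative): the paired load fills two destination tiles
 * from one tsib-described region. From the wrapper's arguments, dst0 appears
 * to supply the row count and first width while dst1 supplies only its own
 * width; that reading is an assumption.
 *
 *   __tile1024i lo = {16, 64}, hi = {16, 64};
 *   __tile_2rpntlvwz0rs(&lo, &hi, base, stride); // loads 2*lo.row rows total
 */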

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1RS </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS
static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1,
                                  const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                                &dst1->tile, base, stride);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written. The last row will not be read from memory but instead
/// filled with zeros.
/// Provides a hint to the implementation that the data will likely become
/// read shared in the near future and the data caching can be optimized.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ1RS </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS
static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1,
                                const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                              &dst1->tile, base, stride);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written. The last row will not be read from memory but instead
/// filled with zeros.
/// Provides a hint to the implementation that the data will likely become
/// read shared in the near future and the data caching can be optimized.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1RS </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS
static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1,
                                  const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                                &dst1->tile, base, stride);
}

#undef __DEFAULT_FN_ATTRS
#endif /* __x86_64__ */
#endif /* __AMX_MOVRS_TRANSPOSEINTRIN_H */
@@ -1,108 +0,0 @@
/*===------------- amxtf32intrin.h - AMX_TF32 intrinsics -*- C++ -*---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <amxtf32intrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AMX_TF32INTRIN_H
#define __AMX_TF32INTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS_TF32 \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-tf32")))

/// Do matrix multiplication of \a a and \a b, and then do matrix addition
/// with \a srcdst.
/// All the calculations are based on float32, but with the lower 13 bits set
/// to 0.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_mmultf32ps(constexpr int srcdst, constexpr int a, \
///                       constexpr int b);
/// \endcode
///
/// This intrinsic corresponds to the <c> TMMULTF32PS </c> instruction.
///
/// \param srcdst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
///
/// \code{.operation}
/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) {
///     dword[12:0] := 0
///     dword[31:13] := x[31:13]
///     return dword
/// }
///
/// DEFINE silence_snan_fp32(x[31:0]) {
///     IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0)
///         x.fraction[22] := 1
///     return x
/// }
///
/// elements_a := a.colsb / 4
/// elements_dest := srcdst.colsb / 4
///
/// FOR m = 0 TO (srcdst.rows-1)
///     tmp[511:0] := 0
///     FOR k = 0 TO (elements_a-1)
///         FOR n = 0 TO (elements_dest-1)
///             af := silence_snan_fp32(a.row[m].fp32[k])
///             bf := silence_snan_fp32(b.row[k].fp32[n])
///             tmp.fp32[n] += zero_lower_mantissa_bits_fp32(af)
///                            * zero_lower_mantissa_bits_fp32(bf)
///         ENDFOR
///     ENDFOR
///
///     FOR n = 0 TO (elements_dest-1)
///         tmp.fp32[n] += srcdst.row[m].fp32[n]
///     ENDFOR
///     write_row_and_zero(srcdst, m, tmp, srcdst.colsb)
///
/// ENDFOR
///
/// zero_upper_rows(srcdst, srcdst.rows)
/// zero_tileconfig_start()
/// \endcode
#define _tile_mmultf32ps(srcdst, a, b) \
  __builtin_ia32_tmmultf32ps((srcdst), (a), (b))
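/* Illustrative sketch (not part of this header): the pseudocode's
 * zero_lower_mantissa_bits_fp32 step in plain C. The helper name is
 * hypothetical; it models the TF32-style truncation of mantissa bits [12:0]
 * before multiplication.
 *
 *   #include <string.h>
 *   static float __tf32_truncate(float x) {
 *     unsigned int bits;
 *     memcpy(&bits, &x, sizeof(bits));
 *     bits &= ~0x1FFFu; // clear the low 13 mantissa bits
 *     memcpy(&x, &bits, sizeof(x));
 *     return x;
 *   }
 */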

static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32
_tile_mmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k,
                          _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tmmultf32ps_internal(m, n, k, dst, src1, src2);
}

/// Do matrix multiplication of src0 and src1, and then do matrix addition
/// with dst. All the calculations are based on float32, but with the lower
/// 13 bits set to 0.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TMMULTF32PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TF32
static void __tile_mmultf32ps(__tile1024i *dst, __tile1024i src0,
                              __tile1024i src1) {
  dst->tile = _tile_mmultf32ps_internal(src0.row, src1.col, src0.col, dst->tile,
                                        src0.tile, src1.tile);
}

#endif // __x86_64__
#endif // __AMX_TF32INTRIN_H
@@ -1,105 +0,0 @@
/*===--------- amxtf32transposeintrin.h - AMX-TF32 and AMX-TRANSPOSE --------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <amxtf32transposeintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AMX_TF32TRANSPOSEINTRIN_H
#define __AMX_TF32TRANSPOSEINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS_TF32_TRANSPOSE \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("amx-tf32,amx-transpose")))

/// \code
/// void _tile_tmmultf32ps(constexpr int srcdst, constexpr int a, \
///                        constexpr int b);
/// \endcode
///
/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction.
///
/// \param srcdst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
///
/// \code{.operation}
/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) {
///     dword[12:0] := 0
///     dword[31:13] := x[31:13]
///     return dword
/// }
///
/// DEFINE silence_snan_fp32(x[31:0]) {
///     IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0)
///         x.fraction[22] := 1
///     return x
/// }
///
/// elements_dest := srcdst.colsb/4
///
/// FOR m := 0 TO (srcdst.rows-1)
///     tmp[511:0] := 0
///     FOR k := 0 TO (a.rows-1)
///         FOR n := 0 TO (elements_dest-1)
///             a1e := silence_snan_fp32(a.row[k].fp32[m])
///             a2e := silence_snan_fp32(b.row[k].fp32[n])
///             s1e := zero_lower_mantissa_bits_fp32(a1e)
///             s2e := zero_lower_mantissa_bits_fp32(a2e)
///             tmp.fp32[n] += s1e * s2e
///         ENDFOR
///     ENDFOR
///
///     FOR n := 0 TO (elements_dest-1)
///         tmp.fp32[n] += srcdst.row[m].fp32[n]
///     ENDFOR
///     write_row_and_zero(srcdst, m, tmp, srcdst.colsb)
///
/// ENDFOR
///
/// zero_upper_rows(srcdst, srcdst.rows)
/// zero_tileconfig_start()
/// \endcode
#define _tile_tmmultf32ps(srcdst, a, b) \
  __builtin_ia32_ttmmultf32ps((srcdst), (a), (b))

// dst = m x n (srcdest), src1 = k x m, src2 = k x n
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32_TRANSPOSE
_tile_tmmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k,
                           _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_ttmmultf32ps_internal(m, n, k, dst, src1, src2);
}

/// Compute transpose and do matrix multiplication of src0 and src1, and then
/// do matrix addition with dst. All the calculations are based on float32,
/// but with the lower 13 bits set to 0.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TF32_TRANSPOSE
static void __tile_tmmultf32ps(__tile1024i *dst, __tile1024i src0,
                               __tile1024i src1) {
  dst->tile = _tile_tmmultf32ps_internal(src0.row, src1.col, src0.col,
                                         dst->tile, src0.tile, src1.tile);
}

#endif // __x86_64__
#endif // __AMX_TF32TRANSPOSEINTRIN_H
@@ -1,248 +0,0 @@
/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 * ===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMX_TRANSPOSEINTRIN_H
#define __AMX_TRANSPOSEINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS_TRANSPOSE \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose")))

#define _tile_2rpntlvwz0(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz0(tdst, base, stride)
#define _tile_2rpntlvwz0t1(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride)
#define _tile_2rpntlvwz1(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz1(tdst, base, stride)
#define _tile_2rpntlvwz1t1(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride)

/// Transpose 32-bit elements from \a src and write the result to \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_transposed(__tile dst, __tile src);
/// \endcode
///
/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src
///    The source tile. Max size is 1024 Bytes.
///
/// \code{.operation}
///
/// FOR i := 0 TO (dst.rows-1)
///     tmp[511:0] := 0
///     FOR j := 0 TO (dst.colsb/4-1)
///         tmp.dword[j] := src.row[j].dword[i]
///     ENDFOR
///     dst.row[i] := tmp
/// ENDFOR
///
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
#define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src)
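/* Illustrative sketch (not part of this header): the transpose in the
 * pseudocode above, for a dense matrix of 32-bit elements in plain arrays.
 * dst is rows x cols, src is the cols x rows input; the name and flat layout
 * are assumptions for exposition only.
 *
 *   static void __transpose32_sketch(int *dst, const int *src,
 *                                    int rows, int cols) {
 *     for (int i = 0; i < rows; ++i)    // dst row i ...
 *       for (int j = 0; j < cols; ++j)  // ... gathers column i of src
 *         dst[i * cols + j] = src[j * rows + i];
 *   }
 */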

static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  // Use __tile1024i_1024a* to escape the alignment check in
  // clang/test/Headers/x86-intrinsics-headers-clean.cpp
  __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
                                      (_tile1024i_1024a *)dst1, base,
                                      (__SIZE_TYPE__)(stride));
}

static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  __builtin_ia32_t2rpntlvwz0t1_internal(
      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
      (__SIZE_TYPE__)(stride));
}

static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
                                      (_tile1024i_1024a *)dst1, base,
                                      (__SIZE_TYPE__)(stride));
}

static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  __builtin_ia32_t2rpntlvwz1t1_internal(
      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
      (__SIZE_TYPE__)(stride));
}

// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE
_tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) {
  return __builtin_ia32_ttransposed_internal(m, n, src);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written.
/// Provides a hint to the implementation that the data will likely not be
/// reused in the near future and the data caching can be optimized.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ0 </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TRANSPOSE
static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1,
                              const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                            &dst1->tile, base, stride);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1 </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TRANSPOSE
static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1,
                                const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                              &dst1->tile, base, stride);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written. The last row will not be read from memory but instead
/// filled with zeros.
/// Provides a hint to the implementation that the data will likely not be
/// reused in the near future and the data caching can be optimized.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TRANSPOSE
static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1,
                              const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                            &dst1->tile, base, stride);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written. The last row will not be read from memory but instead
/// filled with zeros.
/// Provides a hint to the implementation that the data will likely not be
/// reused in the near future and the data caching can be optimized.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1 </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TRANSPOSE
static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1,
                                const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                              &dst1->tile, base, stride);
}

/// Transpose 32-bit elements from src and write the result to dst.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src
///    The source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TRANSPOSE
static void __tile_transposed(__tile1024i *dst, __tile1024i src) {
  dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile);
}

#endif /* __x86_64__ */
#endif /* __AMX_TRANSPOSEINTRIN_H */
@@ -1,16 +0,0 @@
//===----- andes_vector.h - Andes Vector definitions ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _ANDES_VECTOR_H_
#define _ANDES_VECTOR_H_

#include "riscv_vector.h"

#pragma clang riscv intrinsic andes_vector

#endif //_ANDES_VECTOR_H_
@@ -1,35 +0,0 @@
/*===---- arm64intr.h - ARM64 Windows intrinsics -------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Only include this if we're compiling for the windows platform. */
#ifndef _MSC_VER
#include_next <arm64intr.h>
#else

#ifndef __ARM64INTR_H
#define __ARM64INTR_H

typedef enum
{
  _ARM64_BARRIER_SY    = 0xF,
  _ARM64_BARRIER_ST    = 0xE,
  _ARM64_BARRIER_LD    = 0xD,
  _ARM64_BARRIER_ISH   = 0xB,
  _ARM64_BARRIER_ISHST = 0xA,
  _ARM64_BARRIER_ISHLD = 0x9,
  _ARM64_BARRIER_NSH   = 0x7,
  _ARM64_BARRIER_NSHST = 0x6,
  _ARM64_BARRIER_NSHLD = 0x5,
  _ARM64_BARRIER_OSH   = 0x3,
  _ARM64_BARRIER_OSHST = 0x2,
  _ARM64_BARRIER_OSHLD = 0x1
} _ARM64INTR_BARRIER_TYPE;

#endif /* __ARM64INTR_H */
#endif /* _MSC_VER */
@@ -1,855 +0,0 @@
/*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 * The Arm C Language Extensions specifications can be found in the following
 * link: https://github.com/ARM-software/acle/releases
 *
 * The ACLE section numbers are subject to change. When consulting the
 * specifications, it is recommended to search using section titles if
 * the section numbers look outdated.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __ARM_ACLE_H
#define __ARM_ACLE_H

#ifndef __ARM_ACLE
#error "ACLE intrinsics support not enabled."
#endif

#include <stdint.h>

#if defined(__cplusplus)
extern "C" {
#endif

/* 7 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
/* 7.3 Memory barriers */
void __dmb(unsigned int);
void __dsb(unsigned int);
void __isb(unsigned int);

/* 7.4 Hints */
void __wfi(void);
void __wfe(void);
void __sev(void);
void __sevl(void);
void __yield(void);

#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __dbg(t) __builtin_arm_dbg(t)
#endif

#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
#define _CHKFEAT_GCS 1
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__chkfeat(uint64_t __features) {
  return __builtin_arm_chkfeat(__features) ^ __features;
}
#endif

/* 7.5 Swap */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__swp(uint32_t __x, volatile uint32_t *__p) {
  uint32_t v;
  do
    v = __builtin_arm_ldrex(__p);
  while (__builtin_arm_strex(__x, __p));
  return v;
}

/* 7.6 Memory prefetch intrinsics */
/* 7.6.1 Data prefetch */
#define __pld(addr) __pldx(0, 0, 0, addr)

#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __pldx(access_kind, cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, access_kind, 1)
#else
#define __pldx(access_kind, cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, access_kind, cache_level, retention_policy, 1)
#endif

/* 7.6.2 Instruction prefetch */
#define __pli(addr) __plix(0, 0, addr)

#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __plix(cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, 0, 0)
#else
#define __plix(cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, 0, cache_level, retention_policy, 0)
#endif

/* 7.7 NOP */
#if !defined(_MSC_VER) || (!defined(__aarch64__) && !defined(__arm64ec__))
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
  __builtin_arm_nop();
}
#endif

/* 8 DATA-PROCESSING INTRINSICS */
/* 8.2 Miscellaneous data-processing intrinsics */
/* ROR */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__ror(uint32_t __x, uint32_t __y) {
  __y %= 32;
  if (__y == 0)
    return __x;
  return (__x >> __y) | (__x << (32 - __y));
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rorll(uint64_t __x, uint32_t __y) {
  __y %= 64;
  if (__y == 0)
    return __x;
  return (__x >> __y) | (__x << (64 - __y));
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rorl(unsigned long __x, uint32_t __y) {
#if __SIZEOF_LONG__ == 4
  return __ror(__x, __y);
#else
  return __rorll(__x, __y);
#endif
}
|
|
||||||
|
|
||||||
|
|
||||||
/* CLZ */
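/* Count leading zeros. Per ACLE these are well defined for a zero input,
 * returning the operand width (32 or 64), unlike plain __builtin_clz. */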
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clz(uint32_t __t) {
  return __builtin_arm_clz(__t);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clzl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_arm_clz(__t);
#else
  return __builtin_arm_clz64(__t);
#endif
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clzll(uint64_t __t) {
  return __builtin_arm_clz64(__t);
}

/* CLS */
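/* Count leading sign bits: the number of bits following the most
 * significant bit that match it; the sign bit itself is not counted. */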
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__cls(uint32_t __t) {
  return __builtin_arm_cls(__t);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clsl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_arm_cls(__t);
#else
  return __builtin_arm_cls64(__t);
#endif
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clsll(uint64_t __t) {
  return __builtin_arm_cls64(__t);
}

/* REV */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev(uint32_t __t) {
  return __builtin_bswap32(__t);
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__revl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_bswap32(__t);
#else
  return __builtin_bswap64(__t);
#endif
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__revll(uint64_t __t) {
  return __builtin_bswap64(__t);
}

/* REV16 */
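/* REV16 reverses the bytes within each 16-bit halfword. The 32-bit form is
 * synthesized as a full byte reverse followed by a 16-bit rotate, which
 * yields the same permutation. */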
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev16(uint32_t __t) {
  return __ror(__rev(__t), 16);
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rev16ll(uint64_t __t) {
  return (((uint64_t)__rev16(__t >> 32)) << 32) | (uint64_t)__rev16((uint32_t)__t);
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rev16l(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __rev16(__t);
#else
  return __rev16ll(__t);
#endif
}

/* REVSH */
static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
__revsh(int16_t __t) {
  return (int16_t)__builtin_bswap16((uint16_t)__t);
}

/* RBIT */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rbit(uint32_t __t) {
  return __builtin_arm_rbit(__t);
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rbitll(uint64_t __t) {
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
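  /* AArch32 has no 64-bit RBIT: reverse each 32-bit half and swap the
   * halves, so the reversed low half becomes the high half and vice versa. */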
  return (((uint64_t)__builtin_arm_rbit(__t)) << 32) |
         __builtin_arm_rbit(__t >> 32);
#else
  return __builtin_arm_rbit64(__t);
#endif
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rbitl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __rbit(__t);
#else
  return __rbitll(__t);
#endif
}

/* 8.3 16-bit multiplications */
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
__smulbb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
__smulbt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
__smultb(int32_t __a, int32_t __b) {
  return __builtin_arm_smultb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
__smultt(int32_t __a, int32_t __b) {
  return __builtin_arm_smultt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
__smulwb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
__smulwt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwt(__a, __b);
}
#endif

/*
 * 8.4 Saturating intrinsics
 *
 * FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
 * intrinsics are implemented and the flag is enabled.
 */
/* 8.4.1 Width-specified saturation intrinsics */
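/* Per ACLE, __ssat(x, y) saturates x to a signed y-bit range and
 * __usat(x, y) to an unsigned y-bit range; y must be a constant, and the Q
 * flag is set if saturation occurred. */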
#if defined(__ARM_FEATURE_SAT) && __ARM_FEATURE_SAT
#define __ssat(x, y) __builtin_arm_ssat(x, y)
#define __usat(x, y) __builtin_arm_usat(x, y)
#endif

/* 8.4.2 Saturating addition and subtraction intrinsics */
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__qadd(int32_t __t, int32_t __v) {
  return __builtin_arm_qadd(__t, __v);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__qsub(int32_t __t, int32_t __v) {
  return __builtin_arm_qsub(__t, __v);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__qdbl(int32_t __t) {
  return __builtin_arm_qadd(__t, __t);
}
#endif

/* 8.4.3 Accumulating multiplications */
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlabb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlabt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlatb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlatt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlawb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlawt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawt(__a, __b, __c);
}
#endif


/* 8.5.4 Parallel 16-bit saturation */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
#define __usat16(x, y) __builtin_arm_usat16(x, y)
#endif

/* 8.5.5 Packing and unpacking */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
typedef int32_t int8x4_t;
typedef int32_t int16x2_t;
typedef uint32_t uint8x4_t;
typedef uint32_t uint16x2_t;

static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtab16(int16x2_t __a, int8x4_t __b) {
  return __builtin_arm_sxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtb16(int8x4_t __a) {
  return __builtin_arm_sxtb16(__a);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtab16(int16x2_t __a, int8x4_t __b) {
  return __builtin_arm_uxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtb16(int8x4_t __a) {
  return __builtin_arm_uxtb16(__a);
}
#endif

/* 8.5.6 Parallel selection */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__sel(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_sel(__a, __b);
}
#endif

/* 8.5.7 Parallel 8-bit addition and subtraction */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_qadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qsub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_qsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__sadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_sadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_shadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shsub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_shsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__ssub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_ssub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uhadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhsub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uhsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uqadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqsub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uqsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__usub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_usub8(__a, __b);
}
#endif

/* 8.5.8 Sum of 8-bit absolute differences */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usad8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_usad8(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
  return __builtin_arm_usada8(__a, __b, __c);
}
#endif

/* 8.5.9 Parallel 16-bit addition and subtraction */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_sadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_sasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_ssax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_ssub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_usax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_usub16(__a, __b);
}
#endif

/* 8.5.10 Parallel 16-bit multiplication */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlad(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smladx(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smladx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlald(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlald(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlaldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlaldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsd(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlsd(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsdx(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlsdx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsld(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlsld(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlsldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuad(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smuad(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuadx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smuadx(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusd(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smusd(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusdx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smusdx(__a, __b);
}
#endif

/* 8.6 Floating-point data-processing intrinsics */
#if (defined(__ARM_FEATURE_DIRECTED_ROUNDING) && \
     (__ARM_FEATURE_DIRECTED_ROUNDING)) && \
    (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
static __inline__ double __attribute__((__always_inline__, __nodebug__))
__rintn(double __a) {
  return __builtin_roundeven(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__))
__rintnf(float __a) {
  return __builtin_roundevenf(__a);
}
#endif

/* 8.8 CRC32 intrinsics */
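/* The __crc32{b,h,w,d} intrinsics use the CRC-32 polynomial (0x04C11DB7);
 * the __crc32c{b,h,w,d} variants use the CRC-32C (Castagnoli) polynomial
 * (0x1EDC6F41). The suffix gives the width of the data operand: byte,
 * halfword, word or doubleword. */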
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32b(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32b(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32h(uint32_t __a, uint16_t __b) {
  return __builtin_arm_crc32h(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32w(uint32_t __a, uint32_t __b) {
  return __builtin_arm_crc32w(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32d(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32d(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cb(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32cb(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32ch(uint32_t __a, uint16_t __b) {
  return __builtin_arm_crc32ch(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cw(uint32_t __a, uint32_t __b) {
  return __builtin_arm_crc32cw(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cd(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32cd(__a, __b);
}

/* 8.6 Floating-point data-processing intrinsics */
/* Armv8.3-A Javascript conversion intrinsic */
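/* __jcvt lowers to the FJCVTZS instruction, which converts a double to a
 * signed 32-bit integer with the wrapping behavior of JavaScript's ToInt32
 * conversion. */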
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("v8.3a")))
__jcvt(double __a) {
  return __builtin_arm_jcvt(__a);
}
#endif

/* Armv8.5-A FP rounding intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32zf(float __a) {
  return __builtin_arm_rint32zf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32z(double __a) {
  return __builtin_arm_rint32z(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64zf(float __a) {
  return __builtin_arm_rint64zf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64z(double __a) {
  return __builtin_arm_rint64z(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32xf(float __a) {
  return __builtin_arm_rint32xf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32x(double __a) {
  return __builtin_arm_rint32x(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64xf(float __a) {
  return __builtin_arm_rint64xf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64x(double __a) {
  return __builtin_arm_rint64x(__a);
}
#endif

/* 8.9 Armv8.7-A load/store 64-byte intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
typedef struct {
  uint64_t val[8];
} data512_t;

static __inline__ data512_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_ld64b(const void *__addr) {
  data512_t __value;
  __builtin_arm_ld64b(__addr, __value.val);
  return __value;
}
static __inline__ void __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64b(void *__addr, data512_t __value) {
  __builtin_arm_st64b(__addr, __value.val);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv(void *__addr, data512_t __value) {
  return __builtin_arm_st64bv(__addr, __value.val);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv0(void *__addr, data512_t __value) {
  return __builtin_arm_st64bv0(__addr, __value.val);
}
#endif

/* 11.1 Special register intrinsics */
#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
#define __arm_rsr128(sysreg) __builtin_arm_rsr128(sysreg)
#define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
#define __arm_rsrf(sysreg) __builtin_bit_cast(float, __arm_rsr(sysreg))
#define __arm_rsrf64(sysreg) __builtin_bit_cast(double, __arm_rsr64(sysreg))
#define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
#define __arm_wsr128(sysreg, v) __builtin_arm_wsr128(sysreg, v)
#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
#define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v))
#define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v))

/* 10.3 MTE intrinsics */
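/* These wrap the MTE tag-manipulation instructions suggested by the builtin
 * names: IRG, ADDG, GMI, LDG, STG and SUBP. */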
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
#define __arm_mte_create_random_tag(__ptr, __mask) __builtin_arm_irg(__ptr, __mask)
#define __arm_mte_increment_tag(__ptr, __tag_offset) __builtin_arm_addg(__ptr, __tag_offset)
#define __arm_mte_exclude_tag(__ptr, __excluded) __builtin_arm_gmi(__ptr, __excluded)
#define __arm_mte_get_tag(__ptr) __builtin_arm_ldg(__ptr)
#define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr)
#define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)

/* 18 memcpy family of operations intrinsics - MOPS */
#define __arm_mops_memset_tag(__tagged_address, __value, __size) \
  __builtin_arm_mops_memset_tag(__tagged_address, __value, __size)
#endif

/* 11.3 Coprocessor Intrinsics */
#if defined(__ARM_FEATURE_COPROC)

#if (__ARM_FEATURE_COPROC & 0x1)

#if (__ARM_ARCH < 8)
#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2) \
  __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
#endif /* __ARM_ARCH < 8 */

#define __arm_ldc(coproc, CRd, p) __builtin_arm_ldc(coproc, CRd, p)
#define __arm_stc(coproc, CRd, p) __builtin_arm_stc(coproc, CRd, p)

#define __arm_mcr(coproc, opc1, value, CRn, CRm, opc2) \
  __builtin_arm_mcr(coproc, opc1, value, CRn, CRm, opc2)
#define __arm_mrc(coproc, opc1, CRn, CRm, opc2) \
  __builtin_arm_mrc(coproc, opc1, CRn, CRm, opc2)

#if (__ARM_ARCH != 4) && (__ARM_ARCH < 8)
#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
#endif /* (__ARM_ARCH != 4) && (__ARM_ARCH < 8) */

#if (__ARM_ARCH_8M_MAIN__) || (__ARM_ARCH_8_1M_MAIN__)
#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2) \
  __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
#endif /* __ARM_ARCH_8M_MAIN__ || __ARM_ARCH_8_1M_MAIN__ */

#endif /* __ARM_FEATURE_COPROC & 0x1 */

#if (__ARM_FEATURE_COPROC & 0x2)
#define __arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2) \
  __builtin_arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2)
#define __arm_ldc2(coproc, CRd, p) __builtin_arm_ldc2(coproc, CRd, p)
#define __arm_stc2(coproc, CRd, p) __builtin_arm_stc2(coproc, CRd, p)
#define __arm_ldc2l(coproc, CRd, p) __builtin_arm_ldc2l(coproc, CRd, p)
#define __arm_stc2l(coproc, CRd, p) __builtin_arm_stc2l(coproc, CRd, p)
#define __arm_mcr2(coproc, opc1, value, CRn, CRm, opc2) \
  __builtin_arm_mcr2(coproc, opc1, value, CRn, CRm, opc2)
#define __arm_mrc2(coproc, opc1, CRn, CRm, opc2) \
  __builtin_arm_mrc2(coproc, opc1, CRn, CRm, opc2)
#endif

#if (__ARM_FEATURE_COPROC & 0x4)
#define __arm_mcrr(coproc, opc1, value, CRm) \
  __builtin_arm_mcrr(coproc, opc1, value, CRm)
#define __arm_mrrc(coproc, opc1, CRm) __builtin_arm_mrrc(coproc, opc1, CRm)
#endif

#if (__ARM_FEATURE_COPROC & 0x8)
#define __arm_mcrr2(coproc, opc1, value, CRm) \
  __builtin_arm_mcrr2(coproc, opc1, value, CRm)
#define __arm_mrrc2(coproc, opc1, CRm) __builtin_arm_mrrc2(coproc, opc1, CRm)
#endif

#endif // __ARM_FEATURE_COPROC

/* 17 Transactional Memory Extension (TME) Intrinsics */
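/* Per the ACLE TME section, __tstart() returns 0 if the transaction starts
 * successfully; otherwise it returns a failure code composed of the
 * _TMFAILURE_* bits below (_TMFAILURE_RTRY set means a retry may succeed). */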
#if defined(__ARM_FEATURE_TME) && __ARM_FEATURE_TME

#define _TMFAILURE_REASON 0x00007fffu
#define _TMFAILURE_RTRY 0x00008000u
#define _TMFAILURE_CNCL 0x00010000u
#define _TMFAILURE_MEM 0x00020000u
#define _TMFAILURE_IMP 0x00040000u
#define _TMFAILURE_ERR 0x00080000u
#define _TMFAILURE_SIZE 0x00100000u
#define _TMFAILURE_NEST 0x00200000u
#define _TMFAILURE_DBG 0x00400000u
#define _TMFAILURE_INT 0x00800000u
#define _TMFAILURE_TRIVIAL 0x01000000u

#define __tstart() __builtin_arm_tstart()
#define __tcommit() __builtin_arm_tcommit()
#define __tcancel(__arg) __builtin_arm_tcancel(__arg)
#define __ttest() __builtin_arm_ttest()

#endif /* __ARM_FEATURE_TME */

/* 8.7 Armv8.5-A Random number generation intrinsics */
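/* __rndr and __rndrrs store a random value through __p and return 0 on
 * success (nonzero on failure); the RNDRRS form additionally requests a
 * reseed of the generator. */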
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndr(uint64_t *__p) {
  return __builtin_arm_rndr(__p);
}
static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndrrs(uint64_t *__p) {
  return __builtin_arm_rndrrs(__p);
}
#endif

/* 11.2 Guarded Control Stack intrinsics */
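/* Assumed semantics, per the ACLE GCS section: __gcspr reads the current
 * Guarded Control Stack pointer (GCSPR_EL0), __gcspopm pops one entry from
 * the active GCS, and __gcsss switches to the GCS passed in. */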
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ void * __attribute__((__always_inline__, __nodebug__))
__gcspr() {
  return (void *)__builtin_arm_rsr64("gcspr_el0");
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("gcs")))
__gcspopm() {
  return __builtin_arm_gcspopm(0);
}

static __inline__ void *__attribute__((__always_inline__, __nodebug__,
                                       target("gcs")))
__gcsss(void *__stack) {
  return __builtin_arm_gcsss(__stack);
}
#endif

#if defined(__cplusplus)
}
#endif

#endif /* __ARM_ACLE_H */
@@ -1,20 +0,0 @@
/*===---- arm_bf16.h - ARM BF16 intrinsics -----------------------------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __ARM_BF16_H
#define __ARM_BF16_H

typedef __bf16 bfloat16_t;
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))


#undef __ai

#endif
@@ -1,410 +0,0 @@
/*===---- arm_cde.h - ARM CDE intrinsics -----------------------------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __ARM_CDE_H
#define __ARM_CDE_H

#if !__ARM_FEATURE_CDE
#error "CDE support not enabled"
#endif

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

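/* Assumed reading, per the ACLE CDE section: the leading int argument is the
 * coprocessor number and each intrinsic's final uint32_t is an immediate;
 * both must be compile-time constants. */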
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1)))
uint32_t __arm_cx1(int, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1a)))
uint32_t __arm_cx1a(int, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1d)))
uint64_t __arm_cx1d(int, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1da)))
uint64_t __arm_cx1da(int, uint64_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2)))
uint32_t __arm_cx2(int, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2a)))
uint32_t __arm_cx2a(int, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2d)))
uint64_t __arm_cx2d(int, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2da)))
uint64_t __arm_cx2da(int, uint64_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3)))
uint32_t __arm_cx3(int, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3a)))
uint32_t __arm_cx3a(int, uint32_t, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3d)))
uint64_t __arm_cx3d(int, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3da)))
uint64_t __arm_cx3da(int, uint64_t, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1_u32)))
uint32_t __arm_vcx1_u32(int, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1a_u32)))
uint32_t __arm_vcx1a_u32(int, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1d_u64)))
uint64_t __arm_vcx1d_u64(int, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1da_u64)))
uint64_t __arm_vcx1da_u64(int, uint64_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2_u32)))
uint32_t __arm_vcx2_u32(int, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2a_u32)))
uint32_t __arm_vcx2a_u32(int, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2d_u64)))
uint64_t __arm_vcx2d_u64(int, uint64_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2da_u64)))
uint64_t __arm_vcx2da_u64(int, uint64_t, uint64_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3_u32)))
uint32_t __arm_vcx3_u32(int, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3a_u32)))
uint32_t __arm_vcx3a_u32(int, uint32_t, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3d_u64)))
uint64_t __arm_vcx3d_u64(int, uint64_t, uint64_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3da_u64)))
uint64_t __arm_vcx3da_u64(int, uint64_t, uint64_t, uint64_t, uint32_t);

#if __ARM_FEATURE_MVE

typedef uint16_t mve_pred16_t;
typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) int16_t int16x8_t;
typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) int32_t int32x4_t;
typedef __attribute__((__neon_vector_type__(2), __clang_arm_mve_strict_polymorphism)) int64_t int64x2_t;
typedef __attribute__((__neon_vector_type__(16), __clang_arm_mve_strict_polymorphism)) int8_t int8x16_t;
typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) uint16_t uint16x8_t;
typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) uint32_t uint32x4_t;
typedef __attribute__((__neon_vector_type__(2), __clang_arm_mve_strict_polymorphism)) uint64_t uint64x2_t;
typedef __attribute__((__neon_vector_type__(16), __clang_arm_mve_strict_polymorphism)) uint8_t uint8x16_t;

static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s16)))
int16x8_t __arm_vcx1q_m(int, int16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s32)))
int32x4_t __arm_vcx1q_m(int, int32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s64)))
int64x2_t __arm_vcx1q_m(int, int64x2_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s8)))
int8x16_t __arm_vcx1q_m(int, int8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u16)))
uint16x8_t __arm_vcx1q_m(int, uint16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u32)))
uint32x4_t __arm_vcx1q_m(int, uint32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u64)))
uint64x2_t __arm_vcx1q_m(int, uint64x2_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u8)))
uint8x16_t __arm_vcx1q_m(int, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_u8)))
uint8x16_t __arm_vcx1q_u8(int, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s16)))
int16x8_t __arm_vcx1qa_m(int, int16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s32)))
int32x4_t __arm_vcx1qa_m(int, int32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s64)))
int64x2_t __arm_vcx1qa_m(int, int64x2_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s8)))
int8x16_t __arm_vcx1qa_m(int, int8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u16)))
uint16x8_t __arm_vcx1qa_m(int, uint16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u32)))
uint32x4_t __arm_vcx1qa_m(int, uint32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u64)))
uint64x2_t __arm_vcx1qa_m(int, uint64x2_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u8)))
uint8x16_t __arm_vcx1qa_m(int, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s16)))
int16x8_t __arm_vcx1qa(int, int16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s32)))
int32x4_t __arm_vcx1qa(int, int32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s64)))
int64x2_t __arm_vcx1qa(int, int64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s8)))
int8x16_t __arm_vcx1qa(int, int8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u16)))
uint16x8_t __arm_vcx1qa(int, uint16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u32)))
uint32x4_t __arm_vcx1qa(int, uint32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u64)))
uint64x2_t __arm_vcx1qa(int, uint64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u8)))
uint8x16_t __arm_vcx1qa(int, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s16)))
int16x8_t __arm_vcx2q_m_impl(int, int16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s32)))
int32x4_t __arm_vcx2q_m_impl(int, int32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s64)))
int64x2_t __arm_vcx2q_m_impl(int, int64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s8)))
int8x16_t __arm_vcx2q_m_impl(int, int8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u16)))
uint16x8_t __arm_vcx2q_m_impl(int, uint16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u32)))
uint32x4_t __arm_vcx2q_m_impl(int, uint32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u64)))
uint64x2_t __arm_vcx2q_m_impl(int, uint64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u8)))
uint8x16_t __arm_vcx2q_m_impl(int, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s16)))
int16x8_t __arm_vcx2q(int, int16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s32)))
int32x4_t __arm_vcx2q(int, int32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s64)))
int64x2_t __arm_vcx2q(int, int64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s8)))
int8x16_t __arm_vcx2q(int, int8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u16)))
uint16x8_t __arm_vcx2q(int, uint16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u32)))
uint32x4_t __arm_vcx2q(int, uint32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u64)))
uint64x2_t __arm_vcx2q(int, uint64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8)))
uint8x16_t __arm_vcx2q(int, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s16)))
uint8x16_t __arm_vcx2q_u8(int, int16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s32)))
uint8x16_t __arm_vcx2q_u8(int, int32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s64)))
uint8x16_t __arm_vcx2q_u8(int, int64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s8)))
uint8x16_t __arm_vcx2q_u8(int, int8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u16)))
uint8x16_t __arm_vcx2q_u8(int, uint16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u32)))
uint8x16_t __arm_vcx2q_u8(int, uint32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u64)))
uint8x16_t __arm_vcx2q_u8(int, uint64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u8)))
uint8x16_t __arm_vcx2q_u8(int, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s16)))
int16x8_t __arm_vcx2qa_impl(int, int16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s32)))
int32x4_t __arm_vcx2qa_impl(int, int32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s64)))
int64x2_t __arm_vcx2qa_impl(int, int64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s8)))
int8x16_t __arm_vcx2qa_impl(int, int8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u16)))
uint16x8_t __arm_vcx2qa_impl(int, uint16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u32)))
uint32x4_t __arm_vcx2qa_impl(int, uint32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u64)))
uint64x2_t __arm_vcx2qa_impl(int, uint64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u8)))
uint8x16_t __arm_vcx2qa_impl(int, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s16)))
int16x8_t __arm_vcx2qa_m_impl(int, int16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s32)))
int32x4_t __arm_vcx2qa_m_impl(int, int32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s64)))
int64x2_t __arm_vcx2qa_m_impl(int, int64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s8)))
int8x16_t __arm_vcx2qa_m_impl(int, int8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u16)))
uint16x8_t __arm_vcx2qa_m_impl(int, uint16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u32)))
uint32x4_t __arm_vcx2qa_m_impl(int, uint32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u64)))
uint64x2_t __arm_vcx2qa_m_impl(int, uint64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u8)))
uint8x16_t __arm_vcx2qa_m_impl(int, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s16)))
int16x8_t __arm_vcx3q_impl(int, int16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s32)))
int32x4_t __arm_vcx3q_impl(int, int32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s64)))
int64x2_t __arm_vcx3q_impl(int, int64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s8)))
int8x16_t __arm_vcx3q_impl(int, int8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u16)))
uint16x8_t __arm_vcx3q_impl(int, uint16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u32)))
uint32x4_t __arm_vcx3q_impl(int, uint32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u64)))
uint64x2_t __arm_vcx3q_impl(int, uint64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u8)))
uint8x16_t __arm_vcx3q_impl(int, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s16)))
int16x8_t __arm_vcx3q_m_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s32)))
int32x4_t __arm_vcx3q_m_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s64)))
int64x2_t __arm_vcx3q_m_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s8)))
int8x16_t __arm_vcx3q_m_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u16)))
uint16x8_t __arm_vcx3q_m_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u32)))
uint32x4_t __arm_vcx3q_m_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u64)))
uint64x2_t __arm_vcx3q_m_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u8)))
uint8x16_t __arm_vcx3q_m_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s16)))
uint8x16_t __arm_vcx3q_u8_impl(int, int16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s32)))
uint8x16_t __arm_vcx3q_u8_impl(int, int32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s64)))
uint8x16_t __arm_vcx3q_u8_impl(int, int64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s8)))
uint8x16_t __arm_vcx3q_u8_impl(int, int8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u16)))
uint8x16_t __arm_vcx3q_u8_impl(int, uint16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u32)))
uint8x16_t __arm_vcx3q_u8_impl(int, uint32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u64)))
uint8x16_t __arm_vcx3q_u8_impl(int, uint64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u8)))
uint8x16_t __arm_vcx3q_u8_impl(int, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s16)))
int16x8_t __arm_vcx3qa_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s32)))
int32x4_t __arm_vcx3qa_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s64)))
int64x2_t __arm_vcx3qa_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s8)))
int8x16_t __arm_vcx3qa_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u16)))
uint16x8_t __arm_vcx3qa_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u32)))
uint32x4_t __arm_vcx3qa_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u64)))
|
|
||||||
uint64x2_t __arm_vcx3qa_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t);
|
|
||||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u8)))
|
|
||||||
uint8x16_t __arm_vcx3qa_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t);
|
|
||||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s16)))
|
|
||||||
int16x8_t __arm_vcx3qa_m_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
|
||||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s32)))
|
|
||||||
int32x4_t __arm_vcx3qa_m_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
|
||||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s64)))
|
|
||||||
int64x2_t __arm_vcx3qa_m_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
|
||||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s8)))
|
|
||||||
int8x16_t __arm_vcx3qa_m_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
|
||||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u16)))
|
|
||||||
uint16x8_t __arm_vcx3qa_m_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
|
||||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u32)))
|
|
||||||
uint32x4_t __arm_vcx3qa_m_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
|
||||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u64)))
|
|
||||||
uint64x2_t __arm_vcx3qa_m_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
|
||||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u8)))
|
|
||||||
uint8x16_t __arm_vcx3qa_m_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
|
||||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u8)))
|
|
||||||
int16x8_t __arm_vreinterpretq_s16_u8(uint8x16_t);
|
|
||||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u8)))
|
|
||||||
int32x4_t __arm_vreinterpretq_s32_u8(uint8x16_t);
|
|
||||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u8)))
|
|
||||||
int64x2_t __arm_vreinterpretq_s64_u8(uint8x16_t);
|
|
||||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u8)))
|
|
||||||
int8x16_t __arm_vreinterpretq_s8_u8(uint8x16_t);
|
|
||||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_u8)))
|
|
||||||
uint16x8_t __arm_vreinterpretq_u16_u8(uint8x16_t);
|
|
||||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_u8)))
|
|
||||||
uint32x4_t __arm_vreinterpretq_u32_u8(uint8x16_t);
|
|
||||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_u8)))
|
|
||||||
uint64x2_t __arm_vreinterpretq_u64_u8(uint8x16_t);
|
|
||||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s16)))
|
|
||||||
uint8x16_t __arm_vreinterpretq_u8(int16x8_t);
|
|
||||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s32)))
|
|
||||||
uint8x16_t __arm_vreinterpretq_u8(int32x4_t);
|
|
||||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s64)))
|
|
||||||
uint8x16_t __arm_vreinterpretq_u8(int64x2_t);
|
|
||||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s8)))
|
|
||||||
uint8x16_t __arm_vreinterpretq_u8(int8x16_t);
|
|
||||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u16)))
|
|
||||||
uint8x16_t __arm_vreinterpretq_u8(uint16x8_t);
|
|
||||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u32)))
|
|
||||||
uint8x16_t __arm_vreinterpretq_u8(uint32x4_t);
|
|
||||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u64)))
|
|
||||||
uint8x16_t __arm_vreinterpretq_u8(uint64x2_t);
|
|
||||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vreinterpretq_u8_u8)))
|
|
||||||
uint8x16_t __arm_vreinterpretq_u8(uint8x16_t);
|
|
||||||
#define __arm_vcx2q_m(cp, inactive, n, imm, pred) __arm_vcx2q_m_impl((cp), (inactive), __arm_vreinterpretq_u8(n), (imm), (pred))
|
|
||||||
#define __arm_vcx2qa(cp, acc, n, imm) __arm_vcx2qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm))
|
|
||||||
#define __arm_vcx2qa_m(cp, acc, n, imm, pred) __arm_vcx2qa_m_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm), (pred))
|
|
||||||
#define __arm_vcx3q(cp, n, m, imm) __arm_vcx3q_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))
|
|
||||||
#define __arm_vcx3q_m(cp, inactive, n, m, imm, pred) __arm_vcx3q_m_impl((cp), (inactive), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm), (pred))
|
|
||||||
#define __arm_vcx3q_u8(cp, n, m, imm) __arm_vcx3q_u8_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))
|
|
||||||
#define __arm_vcx3qa(cp, acc, n, m, imm) __arm_vcx3qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm))
|
|
||||||
#define __arm_vcx3qa_m(cp, acc, n, m, imm, pred) __arm_vcx3qa_m_impl((cp), (acc), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm), (pred))
|
|
||||||
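/*
 * Editor's note: a minimal usage sketch, not part of the header. The macros
 * above make the CDE intrinsics polymorphic: every vector operand is first
 * reinterpreted to uint8x16_t via __arm_vreinterpretq_u8, and the overloaded
 * _impl declaration then dispatches on the accumulator type. The coprocessor
 * number and immediate below are illustrative values only.
 *
 *   uint32x4_t acc = vdupq_n_u32(0);   // accumulator type selects the overload
 *   int16x8_t n = vdupq_n_s16(1);      // any vector type is accepted here
 *   acc = __arm_vcx2qa(0, acc, n, 1);  // expands to __arm_vcx2qa_impl(...)
 */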

#endif /* __ARM_FEATURE_MVE */

#if __ARM_FEATURE_MVE & 2

typedef __fp16 float16_t;
typedef float float32_t;
typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) float16_t float16x8_t;
typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) float32_t float32x4_t;

static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_f16)))
float16x8_t __arm_vcx1q_m(int, float16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_f32)))
float32x4_t __arm_vcx1q_m(int, float32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_f16)))
float16x8_t __arm_vcx1qa(int, float16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_f32)))
float32x4_t __arm_vcx1qa(int, float32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_f16)))
float16x8_t __arm_vcx1qa_m(int, float16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_f32)))
float32x4_t __arm_vcx1qa_m(int, float32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_f16)))
float16x8_t __arm_vcx2q(int, float16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_f32)))
float32x4_t __arm_vcx2q(int, float32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_f16)))
float16x8_t __arm_vcx2q_m_impl(int, float16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_f32)))
float32x4_t __arm_vcx2q_m_impl(int, float32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_f16)))
uint8x16_t __arm_vcx2q_u8(int, float16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_f32)))
uint8x16_t __arm_vcx2q_u8(int, float32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_f16)))
float16x8_t __arm_vcx2qa_impl(int, float16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_f32)))
float32x4_t __arm_vcx2qa_impl(int, float32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_f16)))
float16x8_t __arm_vcx2qa_m_impl(int, float16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_f32)))
float32x4_t __arm_vcx2qa_m_impl(int, float32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_f16)))
float16x8_t __arm_vcx3q_impl(int, float16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_f32)))
float32x4_t __arm_vcx3q_impl(int, float32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_f16)))
float16x8_t __arm_vcx3q_m_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_f32)))
float32x4_t __arm_vcx3q_m_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_f16)))
uint8x16_t __arm_vcx3q_u8_impl(int, float16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_f32)))
uint8x16_t __arm_vcx3q_u8_impl(int, float32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_f16)))
float16x8_t __arm_vcx3qa_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_f32)))
float32x4_t __arm_vcx3qa_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_f16)))
float16x8_t __arm_vcx3qa_m_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_f32)))
float32x4_t __arm_vcx3qa_m_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u8)))
float16x8_t __arm_vreinterpretq_f16_u8(uint8x16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u8)))
float32x4_t __arm_vreinterpretq_f32_u8(uint8x16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f16)))
uint8x16_t __arm_vreinterpretq_u8(float16x8_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f32)))
uint8x16_t __arm_vreinterpretq_u8(float32x4_t);

#endif /* __ARM_FEATURE_MVE & 2 */

#ifdef __cplusplus
} /* extern "C" */
#endif

#endif /* __ARM_CDE_H */
@@ -1,217 +0,0 @@
//===---- arm_cmse.h - Arm CMSE support -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef __ARM_CMSE_H
#define __ARM_CMSE_H

#if (__ARM_FEATURE_CMSE & 0x1)
#include <stddef.h>
#include <stdint.h>

#define __ARM_CMSE_SECURE_MODE (__ARM_FEATURE_CMSE & 0x2)
#define CMSE_MPU_READWRITE 1 /* checks if readwrite_ok field is set */
#define CMSE_AU_NONSECURE 2  /* checks if permissions have secure field unset */
#define CMSE_MPU_UNPRIV 4    /* sets T flag on TT instruction */
#define CMSE_MPU_READ 8      /* checks if read_ok field is set */
#define CMSE_MPU_NONSECURE 16 /* sets A flag, checks if secure field unset */
#define CMSE_NONSECURE (CMSE_AU_NONSECURE | CMSE_MPU_NONSECURE)

#define cmse_check_pointed_object(p, f) \
  cmse_check_address_range((p), sizeof(*(p)), (f))

#if defined(__cplusplus)
extern "C" {
#endif

typedef union {
  struct cmse_address_info {
#ifdef __ARM_BIG_ENDIAN
    /* __ARM_BIG_ENDIAN */
#if (__ARM_CMSE_SECURE_MODE)
    unsigned idau_region : 8;
    unsigned idau_region_valid : 1;
    unsigned secure : 1;
    unsigned nonsecure_readwrite_ok : 1;
    unsigned nonsecure_read_ok : 1;
#else
    unsigned : 12;
#endif
    unsigned readwrite_ok : 1;
    unsigned read_ok : 1;
#if (__ARM_CMSE_SECURE_MODE)
    unsigned sau_region_valid : 1;
#else
    unsigned : 1;
#endif
    unsigned mpu_region_valid : 1;
#if (__ARM_CMSE_SECURE_MODE)
    unsigned sau_region : 8;
#else
    unsigned : 8;
#endif
    unsigned mpu_region : 8;

#else /* __ARM_LITTLE_ENDIAN */
    unsigned mpu_region : 8;
#if (__ARM_CMSE_SECURE_MODE)
    unsigned sau_region : 8;
#else
    unsigned : 8;
#endif
    unsigned mpu_region_valid : 1;
#if (__ARM_CMSE_SECURE_MODE)
    unsigned sau_region_valid : 1;
#else
    unsigned : 1;
#endif
    unsigned read_ok : 1;
    unsigned readwrite_ok : 1;
#if (__ARM_CMSE_SECURE_MODE)
    unsigned nonsecure_read_ok : 1;
    unsigned nonsecure_readwrite_ok : 1;
    unsigned secure : 1;
    unsigned idau_region_valid : 1;
    unsigned idau_region : 8;
#else
    unsigned : 12;
#endif
#endif /*__ARM_LITTLE_ENDIAN */
  } flags;
  unsigned value;
} cmse_address_info_t;

static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
cmse_TT(void *__p) {
  cmse_address_info_t __u;
  __u.value = __builtin_arm_cmse_TT(__p);
  return __u;
}
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
cmse_TTT(void *__p) {
  cmse_address_info_t __u;
  __u.value = __builtin_arm_cmse_TTT(__p);
  return __u;
}

#if __ARM_CMSE_SECURE_MODE
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
cmse_TTA(void *__p) {
  cmse_address_info_t __u;
  __u.value = __builtin_arm_cmse_TTA(__p);
  return __u;
}
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
cmse_TTAT(void *__p) {
  cmse_address_info_t __u;
  __u.value = __builtin_arm_cmse_TTAT(__p);
  return __u;
}
#endif

#define cmse_TT_fptr(p) cmse_TT(__builtin_bit_cast(void *, (p)))
#define cmse_TTT_fptr(p) cmse_TTT(__builtin_bit_cast(void *, (p)))

#if __ARM_CMSE_SECURE_MODE
#define cmse_TTA_fptr(p) cmse_TTA(__builtin_bit_cast(void *, (p)))
#define cmse_TTAT_fptr(p) cmse_TTAT(__builtin_bit_cast(void *, (p)))
#endif

static void *__attribute__((__always_inline__))
cmse_check_address_range(void *__pb, size_t __s, int __flags) {
  uintptr_t __begin = (uintptr_t)__pb;
  uintptr_t __end = __begin + __s - 1;

  if (__end < __begin)
    return NULL; /* wrap around check */

  /* Check whether the range crosses a 32-byte aligned address */
  const int __single_check = (__begin ^ __end) < 0x20u;

  /* execute the right variant of the TT instructions */
  void *__pe = (void *)__end;
  cmse_address_info_t __permb, __perme;
  switch (__flags & (CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE)) {
  case 0:
    __permb = cmse_TT(__pb);
    __perme = __single_check ? __permb : cmse_TT(__pe);
    break;
  case CMSE_MPU_UNPRIV:
    __permb = cmse_TTT(__pb);
    __perme = __single_check ? __permb : cmse_TTT(__pe);
    break;
#if __ARM_CMSE_SECURE_MODE
  case CMSE_MPU_NONSECURE:
    __permb = cmse_TTA(__pb);
    __perme = __single_check ? __permb : cmse_TTA(__pe);
    break;
  case CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE:
    __permb = cmse_TTAT(__pb);
    __perme = __single_check ? __permb : cmse_TTAT(__pe);
    break;
#endif
  /* if CMSE_NONSECURE is specified w/o __ARM_CMSE_SECURE_MODE */
  default:
    return NULL;
  }

  /* check that the range does not cross MPU, SAU, or IDAU region boundaries */
  if (__permb.value != __perme.value)
    return NULL;
#if !(__ARM_CMSE_SECURE_MODE)
  /* CMSE_AU_NONSECURE is only supported when __ARM_FEATURE_CMSE & 0x2 */
  if (__flags & CMSE_AU_NONSECURE)
    return NULL;
#endif

  /* check the permission on the range */
  switch (__flags & ~(CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE)) {
#if (__ARM_CMSE_SECURE_MODE)
  case CMSE_MPU_READ | CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
  case CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
    return __permb.flags.nonsecure_readwrite_ok ? __pb : NULL;

  case CMSE_MPU_READ | CMSE_AU_NONSECURE:
    return __permb.flags.nonsecure_read_ok ? __pb : NULL;

  case CMSE_AU_NONSECURE:
    return __permb.flags.secure ? NULL : __pb;
#endif
  case CMSE_MPU_READ | CMSE_MPU_READWRITE:
  case CMSE_MPU_READWRITE:
    return __permb.flags.readwrite_ok ? __pb : NULL;

  case CMSE_MPU_READ:
    return __permb.flags.read_ok ? __pb : NULL;

  default:
    return NULL;
  }
}
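/*
 * Editor's note: a hedged usage sketch, not part of the header. A secure
 * entry function can validate a buffer handed over by the non-secure world
 * before touching it; ns_ptr and ns_len are hypothetical names.
 *
 *   void *__buf = cmse_check_address_range(ns_ptr, ns_len,
 *                                          CMSE_NONSECURE | CMSE_MPU_READWRITE);
 *   if (__buf == NULL)
 *     cmse_abort();  // range crosses a region boundary or lacks permission
 */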

#if __ARM_CMSE_SECURE_MODE
static int __attribute__((__always_inline__, __nodebug__))
cmse_nonsecure_caller(void) {
  return !((uintptr_t)__builtin_return_address(0) & 1);
}

#define cmse_nsfptr_create(p) \
  __builtin_bit_cast(__typeof__(p), \
    (__builtin_bit_cast(uintptr_t, p) & ~(uintptr_t)1))

#define cmse_is_nsfptr(p) ((__builtin_bit_cast(uintptr_t, p) & 1) == 0)

#endif /* __ARM_CMSE_SECURE_MODE */
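/*
 * Editor's note: an illustrative sketch, not part of the header. In secure
 * state, bit 0 of a function pointer distinguishes a secure call target from
 * a non-secure one; cmse_nsfptr_create clears that bit so the pointer can be
 * invoked with a non-secure transition. raw_cb and ns_cb are hypothetical.
 *
 *   void __attribute__((cmse_nonsecure_call)) (*ns_cb)(void);
 *   ns_cb = cmse_nsfptr_create(raw_cb);  // clear bit 0 for a non-secure call
 *   if (cmse_is_nsfptr(ns_cb))           // true once the LSB is zero
 *     ns_cb();
 */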

void __attribute__((__noreturn__)) cmse_abort(void);
#if defined(__cplusplus)
}
#endif

#endif /* (__ARM_FEATURE_CMSE & 0x1) */

#endif /* __ARM_CMSE_H */
@@ -1,596 +0,0 @@
/*===---- arm_fp16.h - ARM FP16 intrinsics ---------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __ARM_FP16_H
#define __ARM_FP16_H

#include <stdint.h>

typedef __fp16 float16_t;
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))

#if defined(__aarch64__) || defined(__arm64ec__)
#define vabdh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vabdh_f16(__s0, __s1)); \
  __ret; \
})
#define vabsh_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vabsh_f16(__s0)); \
  __ret; \
})
#define vaddh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vaddh_f16(__s0, __s1)); \
  __ret; \
})
#define vcageh_f16(__p0, __p1) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = __builtin_bit_cast(uint16_t, __builtin_neon_vcageh_f16(__s0, __s1)); \
  __ret; \
})
#define vcagth_f16(__p0, __p1) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = __builtin_bit_cast(uint16_t, __builtin_neon_vcagth_f16(__s0, __s1)); \
  __ret; \
})
#define vcaleh_f16(__p0, __p1) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = __builtin_bit_cast(uint16_t, __builtin_neon_vcaleh_f16(__s0, __s1)); \
  __ret; \
})
#define vcalth_f16(__p0, __p1) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = __builtin_bit_cast(uint16_t, __builtin_neon_vcalth_f16(__s0, __s1)); \
  __ret; \
})
#define vceqh_f16(__p0, __p1) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = __builtin_bit_cast(uint16_t, __builtin_neon_vceqh_f16(__s0, __s1)); \
  __ret; \
})
#define vceqzh_f16(__p0) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint16_t, __builtin_neon_vceqzh_f16(__s0)); \
  __ret; \
})
#define vcgeh_f16(__p0, __p1) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = __builtin_bit_cast(uint16_t, __builtin_neon_vcgeh_f16(__s0, __s1)); \
  __ret; \
})
#define vcgezh_f16(__p0) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint16_t, __builtin_neon_vcgezh_f16(__s0)); \
  __ret; \
})
#define vcgth_f16(__p0, __p1) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = __builtin_bit_cast(uint16_t, __builtin_neon_vcgth_f16(__s0, __s1)); \
  __ret; \
})
#define vcgtzh_f16(__p0) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint16_t, __builtin_neon_vcgtzh_f16(__s0)); \
  __ret; \
})
#define vcleh_f16(__p0, __p1) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = __builtin_bit_cast(uint16_t, __builtin_neon_vcleh_f16(__s0, __s1)); \
  __ret; \
})
#define vclezh_f16(__p0) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint16_t, __builtin_neon_vclezh_f16(__s0)); \
  __ret; \
})
#define vclth_f16(__p0, __p1) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = __builtin_bit_cast(uint16_t, __builtin_neon_vclth_f16(__s0, __s1)); \
  __ret; \
})
#define vcltzh_f16(__p0) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint16_t, __builtin_neon_vcltzh_f16(__s0)); \
  __ret; \
})
#define vcvth_n_s16_f16(__p0, __p1) __extension__ ({ \
  int16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(int16_t, __builtin_neon_vcvth_n_s16_f16(__s0, __p1)); \
  __ret; \
})
#define vcvth_n_s32_f16(__p0, __p1) __extension__ ({ \
  int32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(int32_t, __builtin_neon_vcvth_n_s32_f16(__s0, __p1)); \
  __ret; \
})
#define vcvth_n_s64_f16(__p0, __p1) __extension__ ({ \
  int64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(int64_t, __builtin_neon_vcvth_n_s64_f16(__s0, __p1)); \
  __ret; \
})
#define vcvth_n_u16_f16(__p0, __p1) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint16_t, __builtin_neon_vcvth_n_u16_f16(__s0, __p1)); \
  __ret; \
})
#define vcvth_n_u32_f16(__p0, __p1) __extension__ ({ \
  uint32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint32_t, __builtin_neon_vcvth_n_u32_f16(__s0, __p1)); \
  __ret; \
})
#define vcvth_n_u64_f16(__p0, __p1) __extension__ ({ \
  uint64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint64_t, __builtin_neon_vcvth_n_u64_f16(__s0, __p1)); \
  __ret; \
})
#define vcvth_s16_f16(__p0) __extension__ ({ \
  int16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(int16_t, __builtin_neon_vcvth_s16_f16(__s0)); \
  __ret; \
})
#define vcvth_s32_f16(__p0) __extension__ ({ \
  int32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(int32_t, __builtin_neon_vcvth_s32_f16(__s0)); \
  __ret; \
})
#define vcvth_s64_f16(__p0) __extension__ ({ \
  int64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(int64_t, __builtin_neon_vcvth_s64_f16(__s0)); \
  __ret; \
})
#define vcvth_u16_f16(__p0) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint16_t, __builtin_neon_vcvth_u16_f16(__s0)); \
  __ret; \
})
#define vcvth_u32_f16(__p0) __extension__ ({ \
  uint32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint32_t, __builtin_neon_vcvth_u32_f16(__s0)); \
  __ret; \
})
#define vcvth_u64_f16(__p0) __extension__ ({ \
  uint64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint64_t, __builtin_neon_vcvth_u64_f16(__s0)); \
  __ret; \
})
#define vcvtah_s16_f16(__p0) __extension__ ({ \
  int16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(int16_t, __builtin_neon_vcvtah_s16_f16(__s0)); \
  __ret; \
})
#define vcvtah_s32_f16(__p0) __extension__ ({ \
  int32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(int32_t, __builtin_neon_vcvtah_s32_f16(__s0)); \
  __ret; \
})
#define vcvtah_s64_f16(__p0) __extension__ ({ \
  int64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(int64_t, __builtin_neon_vcvtah_s64_f16(__s0)); \
  __ret; \
})
#define vcvtah_u16_f16(__p0) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint16_t, __builtin_neon_vcvtah_u16_f16(__s0)); \
  __ret; \
})
#define vcvtah_u32_f16(__p0) __extension__ ({ \
  uint32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint32_t, __builtin_neon_vcvtah_u32_f16(__s0)); \
  __ret; \
})
#define vcvtah_u64_f16(__p0) __extension__ ({ \
  uint64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint64_t, __builtin_neon_vcvtah_u64_f16(__s0)); \
  __ret; \
})
#define vcvth_f16_u16(__p0) __extension__ ({ \
  float16_t __ret; \
  uint16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vcvth_f16_u16(__s0)); \
  __ret; \
})
#define vcvth_f16_s16(__p0) __extension__ ({ \
  float16_t __ret; \
  int16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vcvth_f16_s16(__s0)); \
  __ret; \
})
#define vcvth_f16_u32(__p0) __extension__ ({ \
  float16_t __ret; \
  uint32_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vcvth_f16_u32(__s0)); \
  __ret; \
})
#define vcvth_f16_s32(__p0) __extension__ ({ \
  float16_t __ret; \
  int32_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vcvth_f16_s32(__s0)); \
  __ret; \
})
#define vcvth_f16_u64(__p0) __extension__ ({ \
  float16_t __ret; \
  uint64_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vcvth_f16_u64(__s0)); \
  __ret; \
})
#define vcvth_f16_s64(__p0) __extension__ ({ \
  float16_t __ret; \
  int64_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vcvth_f16_s64(__s0)); \
  __ret; \
})
#define vcvth_n_f16_u32(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  uint32_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vcvth_n_f16_u32(__s0, __p1)); \
  __ret; \
})
#define vcvth_n_f16_s32(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  int32_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vcvth_n_f16_s32(__s0, __p1)); \
  __ret; \
})
#define vcvth_n_f16_u64(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  uint64_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vcvth_n_f16_u64(__s0, __p1)); \
  __ret; \
})
#define vcvth_n_f16_s64(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  int64_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vcvth_n_f16_s64(__s0, __p1)); \
  __ret; \
})
#define vcvth_n_f16_u16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  uint16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vcvth_n_f16_u16(__s0, __p1)); \
  __ret; \
})
#define vcvth_n_f16_s16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  int16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vcvth_n_f16_s16(__s0, __p1)); \
  __ret; \
})
#define vcvtmh_s16_f16(__p0) __extension__ ({ \
  int16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(int16_t, __builtin_neon_vcvtmh_s16_f16(__s0)); \
  __ret; \
})
#define vcvtmh_s32_f16(__p0) __extension__ ({ \
  int32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(int32_t, __builtin_neon_vcvtmh_s32_f16(__s0)); \
  __ret; \
})
#define vcvtmh_s64_f16(__p0) __extension__ ({ \
  int64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(int64_t, __builtin_neon_vcvtmh_s64_f16(__s0)); \
  __ret; \
})
#define vcvtmh_u16_f16(__p0) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint16_t, __builtin_neon_vcvtmh_u16_f16(__s0)); \
  __ret; \
})
#define vcvtmh_u32_f16(__p0) __extension__ ({ \
  uint32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint32_t, __builtin_neon_vcvtmh_u32_f16(__s0)); \
  __ret; \
})
#define vcvtmh_u64_f16(__p0) __extension__ ({ \
  uint64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint64_t, __builtin_neon_vcvtmh_u64_f16(__s0)); \
  __ret; \
})
#define vcvtnh_s16_f16(__p0) __extension__ ({ \
  int16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(int16_t, __builtin_neon_vcvtnh_s16_f16(__s0)); \
  __ret; \
})
#define vcvtnh_s32_f16(__p0) __extension__ ({ \
  int32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(int32_t, __builtin_neon_vcvtnh_s32_f16(__s0)); \
  __ret; \
})
#define vcvtnh_s64_f16(__p0) __extension__ ({ \
  int64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(int64_t, __builtin_neon_vcvtnh_s64_f16(__s0)); \
  __ret; \
})
#define vcvtnh_u16_f16(__p0) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint16_t, __builtin_neon_vcvtnh_u16_f16(__s0)); \
  __ret; \
})
#define vcvtnh_u32_f16(__p0) __extension__ ({ \
  uint32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint32_t, __builtin_neon_vcvtnh_u32_f16(__s0)); \
  __ret; \
})
#define vcvtnh_u64_f16(__p0) __extension__ ({ \
  uint64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint64_t, __builtin_neon_vcvtnh_u64_f16(__s0)); \
  __ret; \
})
#define vcvtph_s16_f16(__p0) __extension__ ({ \
  int16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(int16_t, __builtin_neon_vcvtph_s16_f16(__s0)); \
  __ret; \
})
#define vcvtph_s32_f16(__p0) __extension__ ({ \
  int32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(int32_t, __builtin_neon_vcvtph_s32_f16(__s0)); \
  __ret; \
})
#define vcvtph_s64_f16(__p0) __extension__ ({ \
  int64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(int64_t, __builtin_neon_vcvtph_s64_f16(__s0)); \
  __ret; \
})
#define vcvtph_u16_f16(__p0) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint16_t, __builtin_neon_vcvtph_u16_f16(__s0)); \
  __ret; \
})
#define vcvtph_u32_f16(__p0) __extension__ ({ \
  uint32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint32_t, __builtin_neon_vcvtph_u32_f16(__s0)); \
  __ret; \
})
#define vcvtph_u64_f16(__p0) __extension__ ({ \
  uint64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(uint64_t, __builtin_neon_vcvtph_u64_f16(__s0)); \
  __ret; \
})
#define vdivh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vdivh_f16(__s0, __s1)); \
  __ret; \
})
#define vfmah_f16(__p0, __p1, __p2) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  float16_t __s2 = __p2; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vfmah_f16(__s0, __s1, __s2)); \
  __ret; \
})
#define vfmsh_f16(__p0, __p1, __p2) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  float16_t __s2 = __p2; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vfmsh_f16(__s0, __s1, __s2)); \
  __ret; \
})
#define vmaxh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vmaxh_f16(__s0, __s1)); \
  __ret; \
})
#define vmaxnmh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vmaxnmh_f16(__s0, __s1)); \
  __ret; \
})
#define vminh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vminh_f16(__s0, __s1)); \
  __ret; \
})
#define vminnmh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vminnmh_f16(__s0, __s1)); \
  __ret; \
})
#define vmulh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vmulh_f16(__s0, __s1)); \
  __ret; \
})
#define vmulxh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vmulxh_f16(__s0, __s1)); \
  __ret; \
})
#define vnegh_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vnegh_f16(__s0)); \
  __ret; \
})
#define vrecpeh_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vrecpeh_f16(__s0)); \
  __ret; \
})
#define vrecpsh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vrecpsh_f16(__s0, __s1)); \
  __ret; \
})
#define vrecpxh_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vrecpxh_f16(__s0)); \
  __ret; \
})
#define vrndh_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vrndh_f16(__s0)); \
  __ret; \
})
#define vrndah_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vrndah_f16(__s0)); \
  __ret; \
})
#define vrndih_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vrndih_f16(__s0)); \
  __ret; \
})
#define vrndmh_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vrndmh_f16(__s0)); \
  __ret; \
})
#define vrndnh_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vrndnh_f16(__s0)); \
  __ret; \
})
#define vrndph_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vrndph_f16(__s0)); \
  __ret; \
})
#define vrndxh_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vrndxh_f16(__s0)); \
  __ret; \
})
#define vrsqrteh_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vrsqrteh_f16(__s0)); \
  __ret; \
})
#define vrsqrtsh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vrsqrtsh_f16(__s0, __s1)); \
  __ret; \
})
#define vsqrth_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vsqrth_f16(__s0)); \
  __ret; \
})
#define vsubh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = __builtin_bit_cast(float16_t, __builtin_neon_vsubh_f16(__s0, __s1)); \
  __ret; \
})
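/*
 * Editor's note: a brief usage sketch, not part of the header. These macros
 * operate on scalar half-precision values; the fused multiply-add rounds once:
 *
 *   float16_t a = 1.5, b = 2.0, acc = 0.25;
 *   acc = vfmah_f16(acc, a, b);    // acc + a * b with a single rounding
 *   uint16_t ge = vcgeh_f16(a, b); // all-ones (0xFFFF) if a >= b, else 0
 */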
#endif

#undef __ai

#endif /* __ARM_FP16_H */
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -1,182 +0,0 @@
/*===---- arm_neon_sve_bridge.h - ARM NEON SVE Bridge intrinsics -----------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __ARM_NEON_SVE_BRIDGE_H
#define __ARM_NEON_SVE_BRIDGE_H

#include <arm_neon.h>
#include <arm_sve.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Function attributes */
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
#define __aio \
  static __inline__ \
  __attribute__((__always_inline__, __nodebug__, __overloadable__))

__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s8)))
svint8_t svset_neonq(svint8_t, int8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s16)))
svint16_t svset_neonq(svint16_t, int16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s32)))
svint32_t svset_neonq(svint32_t, int32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s64)))
svint64_t svset_neonq(svint64_t, int64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u8)))
svuint8_t svset_neonq(svuint8_t, uint8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u16)))
svuint16_t svset_neonq(svuint16_t, uint16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u32)))
svuint32_t svset_neonq(svuint32_t, uint32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u64)))
svuint64_t svset_neonq(svuint64_t, uint64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f16)))
svfloat16_t svset_neonq(svfloat16_t, float16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f32)))
svfloat32_t svset_neonq(svfloat32_t, float32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f64)))
svfloat64_t svset_neonq(svfloat64_t, float64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s8)))
svint8_t svset_neonq_s8(svint8_t, int8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s16)))
svint16_t svset_neonq_s16(svint16_t, int16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s32)))
svint32_t svset_neonq_s32(svint32_t, int32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s64)))
svint64_t svset_neonq_s64(svint64_t, int64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u8)))
svuint8_t svset_neonq_u8(svuint8_t, uint8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u16)))
svuint16_t svset_neonq_u16(svuint16_t, uint16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u32)))
svuint32_t svset_neonq_u32(svuint32_t, uint32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u64)))
svuint64_t svset_neonq_u64(svuint64_t, uint64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f16)))
svfloat16_t svset_neonq_f16(svfloat16_t, float16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f32)))
svfloat32_t svset_neonq_f32(svfloat32_t, float32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f64)))
svfloat64_t svset_neonq_f64(svfloat64_t, float64x2_t);

__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s8)))
int8x16_t svget_neonq(svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s16)))
int16x8_t svget_neonq(svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s32)))
int32x4_t svget_neonq(svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s64)))
int64x2_t svget_neonq(svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u8)))
uint8x16_t svget_neonq(svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u16)))
uint16x8_t svget_neonq(svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u32)))
uint32x4_t svget_neonq(svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u64)))
uint64x2_t svget_neonq(svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f16)))
float16x8_t svget_neonq(svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f32)))
float32x4_t svget_neonq(svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f64)))
float64x2_t svget_neonq(svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s8)))
int8x16_t svget_neonq_s8(svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s16)))
int16x8_t svget_neonq_s16(svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s32)))
int32x4_t svget_neonq_s32(svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s64)))
int64x2_t svget_neonq_s64(svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u8)))
uint8x16_t svget_neonq_u8(svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u16)))
uint16x8_t svget_neonq_u16(svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u32)))
uint32x4_t svget_neonq_u32(svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u64)))
uint64x2_t svget_neonq_u64(svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f16)))
float16x8_t svget_neonq_f16(svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f32)))
float32x4_t svget_neonq_f32(svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f64)))
float64x2_t svget_neonq_f64(svfloat64_t);
|
|
||||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s8)))
|
|
||||||
svint8_t svdup_neonq(int8x16_t);
|
|
||||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s16)))
|
|
||||||
svint16_t svdup_neonq(int16x8_t);
|
|
||||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s32)))
|
|
||||||
svint32_t svdup_neonq(int32x4_t);
|
|
||||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s64)))
|
|
||||||
svint64_t svdup_neonq(int64x2_t);
|
|
||||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u8)))
|
|
||||||
svuint8_t svdup_neonq(uint8x16_t);
|
|
||||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u16)))
|
|
||||||
svuint16_t svdup_neonq(uint16x8_t);
|
|
||||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u32)))
|
|
||||||
svuint32_t svdup_neonq(uint32x4_t);
|
|
||||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u64)))
|
|
||||||
svuint64_t svdup_neonq(uint64x2_t);
|
|
||||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f16)))
|
|
||||||
svfloat16_t svdup_neonq(float16x8_t);
|
|
||||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f32)))
|
|
||||||
svfloat32_t svdup_neonq(float32x4_t);
|
|
||||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f64)))
|
|
||||||
svfloat64_t svdup_neonq(float64x2_t);
|
|
||||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s8)))
|
|
||||||
svint8_t svdup_neonq_s8(int8x16_t);
|
|
||||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s16)))
|
|
||||||
svint16_t svdup_neonq_s16(int16x8_t);
|
|
||||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s32)))
|
|
||||||
svint32_t svdup_neonq_s32(int32x4_t);
|
|
||||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s64)))
|
|
||||||
svint64_t svdup_neonq_s64(int64x2_t);
|
|
||||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u8)))
|
|
||||||
svuint8_t svdup_neonq_u8(uint8x16_t);
|
|
||||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u16)))
|
|
||||||
svuint16_t svdup_neonq_u16(uint16x8_t);
|
|
||||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u32)))
|
|
||||||
svuint32_t svdup_neonq_u32(uint32x4_t);
|
|
||||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u64)))
|
|
||||||
svuint64_t svdup_neonq_u64(uint64x2_t);
|
|
||||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f16)))
|
|
||||||
svfloat16_t svdup_neonq_f16(float16x8_t);
|
|
||||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f32)))
|
|
||||||
svfloat32_t svdup_neonq_f32(float32x4_t);
|
|
||||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f64)))
|
|
||||||
svfloat64_t svdup_neonq_f64(float64x2_t);
|
|
||||||
|
|
||||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
|
|
||||||
svbfloat16_t svset_neonq(svbfloat16_t, bfloat16x8_t);
|
|
||||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
|
|
||||||
svbfloat16_t svset_neonq_bf16(svbfloat16_t, bfloat16x8_t);
|
|
||||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_bf16)))
|
|
||||||
bfloat16x8_t svget_neonq(svbfloat16_t);
|
|
||||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_bf16)))
|
|
||||||
bfloat16x8_t svget_neonq_bf16(svbfloat16_t);
|
|
||||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
|
|
||||||
svbfloat16_t svdup_neonq(bfloat16x8_t);
|
|
||||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
|
|
||||||
svbfloat16_t svdup_neonq_bf16(bfloat16x8_t);
|
|
||||||
|
|
||||||
#undef __ai
|
|
||||||
#undef __aio
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
} // extern "C"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif //__ARM_NEON_SVE_BRIDGE_H
|
|
||||||
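For context, a minimal usage sketch of the bridge intrinsics declared above (not part of the header): round-tripping 128-bit NEON vectors through an SVE computation. It assumes an AArch64 toolchain with SVE enabled (e.g. -march=armv8-a+sve); the function name is illustrative.

#include <arm_neon.h>
#include <arm_sve.h>
#include <arm_neon_sve_bridge.h>

/* Add two NEON vectors using SVE and hand the result back as NEON. */
static float32x4_t add_via_sve(float32x4_t a, float32x4_t b) {
  /* svdup_neonq fills the whole (possibly wider) SVE vector with
   * copies of the 128-bit NEON vector, so every lane is defined. */
  svfloat32_t sa = svdup_neonq_f32(a);
  svfloat32_t sb = svdup_neonq_f32(b);
  /* svget_neonq takes the low 128 bits back out as a NEON vector. */
  return svget_neonq_f32(svadd_f32_x(svptrue_b32(), sa, sb));
}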
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -1,432 +0,0 @@
/*===---- arm_vector_types - ARM vector type ------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#if !defined(__ARM_NEON_H) && !defined(__ARM_SVE_H)
#error "This file should not be used standalone. Please include arm_neon.h or arm_sve.h instead"

#endif
#ifndef __ARM_NEON_TYPES_H
#define __ARM_NEON_TYPES_H
typedef float float32_t;
typedef __fp16 float16_t;
#if defined(__aarch64__) || defined(__arm64ec__)
typedef __mfp8 mfloat8_t;
typedef double float64_t;
#endif


typedef uint64_t fpm_t;

enum __ARM_FPM_FORMAT { __ARM_FPM_E5M2, __ARM_FPM_E4M3 };

enum __ARM_FPM_OVERFLOW { __ARM_FPM_INFNAN, __ARM_FPM_SATURATE };

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_fpm_init(void) {
  return 0;
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_src1_format(fpm_t __fpm, enum __ARM_FPM_FORMAT __format) {
  return (__fpm & ~7ull) | (fpm_t)__format;
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_src2_format(fpm_t __fpm, enum __ARM_FPM_FORMAT __format) {
  return (__fpm & ~0x38ull) | ((fpm_t)__format << 3u);
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_dst_format(fpm_t __fpm, enum __ARM_FPM_FORMAT __format) {
  return (__fpm & ~0x1c0ull) | ((fpm_t)__format << 6u);
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_overflow_mul(fpm_t __fpm, enum __ARM_FPM_OVERFLOW __behaviour) {
  return (__fpm & ~0x4000ull) | ((fpm_t)__behaviour << 14u);
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_overflow_cvt(fpm_t __fpm, enum __ARM_FPM_OVERFLOW __behaviour) {
  return (__fpm & ~0x8000ull) | ((fpm_t)__behaviour << 15u);
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_lscale(fpm_t __fpm, uint64_t __scale) {
  return (__fpm & ~0x7f0000ull) | (__scale << 16u);
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_nscale(fpm_t __fpm, int64_t __scale) {
  return (__fpm & ~0xff000000ull) | (((fpm_t)__scale & 0xffu) << 24u);
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_lscale2(fpm_t __fpm, uint64_t __scale) {
  return (uint32_t)__fpm | (__scale << 32u);
}
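The helpers above build the 64-bit FP8 mode word one bit-field at a time, so they are meant to be chained. An illustrative sketch using only the functions and enumerators defined above:

/* Illustrative only: compose a mode word with both sources in E4M3,
 * the destination in E5M2, and saturating conversions on overflow. */
static fpm_t make_fpm(void) {
  fpm_t fpm = __arm_fpm_init();
  fpm = __arm_set_fpm_src1_format(fpm, __ARM_FPM_E4M3);
  fpm = __arm_set_fpm_src2_format(fpm, __ARM_FPM_E4M3);
  fpm = __arm_set_fpm_dst_format(fpm, __ARM_FPM_E5M2);
  fpm = __arm_set_fpm_overflow_cvt(fpm, __ARM_FPM_SATURATE);
  return fpm;
}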
typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;
typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
typedef __attribute__((neon_vector_type(4))) int16_t int16x4_t;
typedef __attribute__((neon_vector_type(8))) int16_t int16x8_t;
typedef __attribute__((neon_vector_type(2))) int32_t int32x2_t;
typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
typedef __attribute__((neon_vector_type(1))) int64_t int64x1_t;
typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
typedef __attribute__((neon_vector_type(4))) uint16_t uint16x4_t;
typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
typedef __attribute__((neon_vector_type(2))) uint32_t uint32x2_t;
typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
#if defined(__aarch64__) || defined(__arm64ec__)
typedef __attribute__((neon_vector_type(8))) mfloat8_t mfloat8x8_t;
typedef __attribute__((neon_vector_type(16))) mfloat8_t mfloat8x16_t;
#endif
typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t;
typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t;
typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t;
typedef __attribute__((neon_vector_type(4))) float32_t float32x4_t;
#if defined(__aarch64__) || defined(__arm64ec__)
typedef __attribute__((neon_vector_type(1))) float64_t float64x1_t;
typedef __attribute__((neon_vector_type(2))) float64_t float64x2_t;
#endif

typedef struct int8x8x2_t {
  int8x8_t val[2];
} int8x8x2_t;

typedef struct int8x16x2_t {
  int8x16_t val[2];
} int8x16x2_t;

typedef struct int16x4x2_t {
  int16x4_t val[2];
} int16x4x2_t;

typedef struct int16x8x2_t {
  int16x8_t val[2];
} int16x8x2_t;

typedef struct int32x2x2_t {
  int32x2_t val[2];
} int32x2x2_t;

typedef struct int32x4x2_t {
  int32x4_t val[2];
} int32x4x2_t;

typedef struct int64x1x2_t {
  int64x1_t val[2];
} int64x1x2_t;

typedef struct int64x2x2_t {
  int64x2_t val[2];
} int64x2x2_t;

typedef struct uint8x8x2_t {
  uint8x8_t val[2];
} uint8x8x2_t;

typedef struct uint8x16x2_t {
  uint8x16_t val[2];
} uint8x16x2_t;

typedef struct uint16x4x2_t {
  uint16x4_t val[2];
} uint16x4x2_t;

typedef struct uint16x8x2_t {
  uint16x8_t val[2];
} uint16x8x2_t;

typedef struct uint32x2x2_t {
  uint32x2_t val[2];
} uint32x2x2_t;

typedef struct uint32x4x2_t {
  uint32x4_t val[2];
} uint32x4x2_t;

typedef struct uint64x1x2_t {
  uint64x1_t val[2];
} uint64x1x2_t;

typedef struct uint64x2x2_t {
  uint64x2_t val[2];
} uint64x2x2_t;

#if defined(__aarch64__) || defined(__arm64ec__)
typedef struct mfloat8x8x2_t {
  mfloat8x8_t val[2];
} mfloat8x8x2_t;

typedef struct mfloat8x16x2_t {
  mfloat8x16_t val[2];
} mfloat8x16x2_t;

#endif
typedef struct float16x4x2_t {
  float16x4_t val[2];
} float16x4x2_t;

typedef struct float16x8x2_t {
  float16x8_t val[2];
} float16x8x2_t;

typedef struct float32x2x2_t {
  float32x2_t val[2];
} float32x2x2_t;

typedef struct float32x4x2_t {
  float32x4_t val[2];
} float32x4x2_t;

#if defined(__aarch64__) || defined(__arm64ec__)
typedef struct float64x1x2_t {
  float64x1_t val[2];
} float64x1x2_t;

typedef struct float64x2x2_t {
  float64x2_t val[2];
} float64x2x2_t;

#endif
typedef struct int8x8x3_t {
  int8x8_t val[3];
} int8x8x3_t;

typedef struct int8x16x3_t {
  int8x16_t val[3];
} int8x16x3_t;

typedef struct int16x4x3_t {
  int16x4_t val[3];
} int16x4x3_t;

typedef struct int16x8x3_t {
  int16x8_t val[3];
} int16x8x3_t;

typedef struct int32x2x3_t {
  int32x2_t val[3];
} int32x2x3_t;

typedef struct int32x4x3_t {
  int32x4_t val[3];
} int32x4x3_t;

typedef struct int64x1x3_t {
  int64x1_t val[3];
} int64x1x3_t;

typedef struct int64x2x3_t {
  int64x2_t val[3];
} int64x2x3_t;

typedef struct uint8x8x3_t {
  uint8x8_t val[3];
} uint8x8x3_t;

typedef struct uint8x16x3_t {
  uint8x16_t val[3];
} uint8x16x3_t;

typedef struct uint16x4x3_t {
  uint16x4_t val[3];
} uint16x4x3_t;

typedef struct uint16x8x3_t {
  uint16x8_t val[3];
} uint16x8x3_t;

typedef struct uint32x2x3_t {
  uint32x2_t val[3];
} uint32x2x3_t;

typedef struct uint32x4x3_t {
  uint32x4_t val[3];
} uint32x4x3_t;

typedef struct uint64x1x3_t {
  uint64x1_t val[3];
} uint64x1x3_t;

typedef struct uint64x2x3_t {
  uint64x2_t val[3];
} uint64x2x3_t;

#if defined(__aarch64__) || defined(__arm64ec__)
typedef struct mfloat8x8x3_t {
  mfloat8x8_t val[3];
} mfloat8x8x3_t;

typedef struct mfloat8x16x3_t {
  mfloat8x16_t val[3];
} mfloat8x16x3_t;

#endif
typedef struct float16x4x3_t {
  float16x4_t val[3];
} float16x4x3_t;

typedef struct float16x8x3_t {
  float16x8_t val[3];
} float16x8x3_t;

typedef struct float32x2x3_t {
  float32x2_t val[3];
} float32x2x3_t;

typedef struct float32x4x3_t {
  float32x4_t val[3];
} float32x4x3_t;

#if defined(__aarch64__) || defined(__arm64ec__)
typedef struct float64x1x3_t {
  float64x1_t val[3];
} float64x1x3_t;

typedef struct float64x2x3_t {
  float64x2_t val[3];
} float64x2x3_t;

#endif
typedef struct int8x8x4_t {
  int8x8_t val[4];
} int8x8x4_t;

typedef struct int8x16x4_t {
  int8x16_t val[4];
} int8x16x4_t;

typedef struct int16x4x4_t {
  int16x4_t val[4];
} int16x4x4_t;

typedef struct int16x8x4_t {
  int16x8_t val[4];
} int16x8x4_t;

typedef struct int32x2x4_t {
  int32x2_t val[4];
} int32x2x4_t;

typedef struct int32x4x4_t {
  int32x4_t val[4];
} int32x4x4_t;

typedef struct int64x1x4_t {
  int64x1_t val[4];
} int64x1x4_t;

typedef struct int64x2x4_t {
  int64x2_t val[4];
} int64x2x4_t;

typedef struct uint8x8x4_t {
  uint8x8_t val[4];
} uint8x8x4_t;

typedef struct uint8x16x4_t {
  uint8x16_t val[4];
} uint8x16x4_t;

typedef struct uint16x4x4_t {
  uint16x4_t val[4];
} uint16x4x4_t;

typedef struct uint16x8x4_t {
  uint16x8_t val[4];
} uint16x8x4_t;

typedef struct uint32x2x4_t {
  uint32x2_t val[4];
} uint32x2x4_t;

typedef struct uint32x4x4_t {
  uint32x4_t val[4];
} uint32x4x4_t;

typedef struct uint64x1x4_t {
  uint64x1_t val[4];
} uint64x1x4_t;

typedef struct uint64x2x4_t {
  uint64x2_t val[4];
} uint64x2x4_t;

#if defined(__aarch64__) || defined(__arm64ec__)
typedef struct mfloat8x8x4_t {
  mfloat8x8_t val[4];
} mfloat8x8x4_t;

typedef struct mfloat8x16x4_t {
  mfloat8x16_t val[4];
} mfloat8x16x4_t;

#endif
typedef struct float16x4x4_t {
  float16x4_t val[4];
} float16x4x4_t;

typedef struct float16x8x4_t {
  float16x8_t val[4];
} float16x8x4_t;

typedef struct float32x2x4_t {
  float32x2_t val[4];
} float32x2x4_t;

typedef struct float32x4x4_t {
  float32x4_t val[4];
} float32x4x4_t;

#if defined(__aarch64__) || defined(__arm64ec__)
typedef struct float64x1x4_t {
  float64x1_t val[4];
} float64x1x4_t;

typedef struct float64x2x4_t {
  float64x2_t val[4];
} float64x2x4_t;

#endif
typedef __attribute__((neon_vector_type(4))) bfloat16_t bfloat16x4_t;
typedef __attribute__((neon_vector_type(8))) bfloat16_t bfloat16x8_t;

typedef struct bfloat16x4x2_t {
  bfloat16x4_t val[2];
} bfloat16x4x2_t;

typedef struct bfloat16x8x2_t {
  bfloat16x8_t val[2];
} bfloat16x8x2_t;

typedef struct bfloat16x4x3_t {
  bfloat16x4_t val[3];
} bfloat16x4x3_t;

typedef struct bfloat16x8x3_t {
  bfloat16x8_t val[3];
} bfloat16x8x3_t;

typedef struct bfloat16x4x4_t {
  bfloat16x4_t val[4];
} bfloat16x4x4_t;

typedef struct bfloat16x8x4_t {
  bfloat16x8_t val[4];
} bfloat16x8x4_t;

#endif // __ARM_NEON_TYPES_H
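For context, a usage sketch (not part of the header): the xN aggregate types above are the return and argument types of the multi-vector NEON load/store intrinsics. Assumes <arm_neon.h> on an AArch64 target; the function name is illustrative.

#include <arm_neon.h>

/* vld2q_s32 de-interleaves 8 ints into two int32x4_t vectors:
 * pair.val[0] holds elements 0,2,4,6 and pair.val[1] holds 1,3,5,7. */
static int32x4_t even_elements(const int32_t *p) {
  int32x4x2_t pair = vld2q_s32(p);
  return pair.val[0];
}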
@@ -1,31 +0,0 @@
/*===---- armintr.h - ARM Windows intrinsics -------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Only include this if we're compiling for the windows platform. */
#ifndef _MSC_VER
#include_next <armintr.h>
#else

#ifndef __ARMINTR_H
#define __ARMINTR_H

typedef enum
{
  _ARM_BARRIER_SY    = 0xF,
  _ARM_BARRIER_ST    = 0xE,
  _ARM_BARRIER_ISH   = 0xB,
  _ARM_BARRIER_ISHST = 0xA,
  _ARM_BARRIER_NSH   = 0x7,
  _ARM_BARRIER_NSHST = 0x6,
  _ARM_BARRIER_OSH   = 0x3,
  _ARM_BARRIER_OSHST = 0x2
} _ARMINTR_BARRIER_TYPE;

#endif /* __ARMINTR_H */
#endif /* _MSC_VER */
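A brief usage sketch (illustrative, not from the header): when building with MSVC for ARM, these enumerators parameterize the barrier intrinsics, which I am assuming here come in via <intrin.h>.

#include <intrin.h>

static void publish_flag(volatile int *flag) {
  __dmb(_ARM_BARRIER_ISH); /* inner-shareable full data memory barrier */
  *flag = 1;
}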
@@ -1,561 +0,0 @@
/*===----------- avx10_2_512bf16intrin.h - AVX10-BF16 intrinsics ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2_512bf16intrin.h> directly; include <immintrin.h> instead."
#endif

#ifdef __SSE2__

#ifndef __AVX10_2_512BF16INTRIN_H
#define __AVX10_2_512BF16INTRIN_H

typedef __bf16 __m512bh_u __attribute__((__vector_size__(64), __aligned__(1)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \
                 __min_vector_width__(512)))

static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) {
  return __builtin_bit_cast(__m512bh, _mm512_setzero_ps());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_undefined_pbh(void) {
  return (__m512bh)__builtin_ia32_undef512();
}

static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set1_pbh(__bf16 bf) {
  return (__m512bh)(__v32bf){bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf,
                             bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf,
                             bf, bf, bf, bf, bf, bf, bf, bf, bf, bf};
}

static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set_pbh(
    __bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, __bf16 bf6,
    __bf16 bf7, __bf16 bf8, __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12,
    __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16, __bf16 bf17,
    __bf16 bf18, __bf16 bf19, __bf16 bf20, __bf16 bf21, __bf16 bf22,
    __bf16 bf23, __bf16 bf24, __bf16 bf25, __bf16 bf26, __bf16 bf27,
    __bf16 bf28, __bf16 bf29, __bf16 bf30, __bf16 bf31, __bf16 bf32) {
  return (__m512bh)(__v32bf){bf32, bf31, bf30, bf29, bf28, bf27, bf26, bf25,
                             bf24, bf23, bf22, bf21, bf20, bf19, bf18, bf17,
                             bf16, bf15, bf14, bf13, bf12, bf11, bf10, bf9,
                             bf8, bf7, bf6, bf5, bf4, bf3, bf2, bf1};
}

#define _mm512_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, bf9, bf10, \
                        bf11, bf12, bf13, bf14, bf15, bf16, bf17, bf18, bf19, \
                        bf20, bf21, bf22, bf23, bf24, bf25, bf26, bf27, bf28, \
                        bf29, bf30, bf31, bf32) \
  _mm512_set_pbh((bf32), (bf31), (bf30), (bf29), (bf28), (bf27), (bf26), \
                 (bf25), (bf24), (bf23), (bf22), (bf21), (bf20), (bf19), \
                 (bf18), (bf17), (bf16), (bf15), (bf14), (bf13), (bf12), \
                 (bf11), (bf10), (bf9), (bf8), (bf7), (bf6), (bf5), (bf4), \
                 (bf3), (bf2), (bf1))
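Worth noting from the definitions above: _mm512_set_pbh lists lanes from highest to lowest, so _mm512_setr_pbh (memory order) is simply _mm512_set_pbh with its arguments reversed. An illustrative sketch, assuming the compiler accepts a float-to-__bf16 conversion here (true for clang with AVX10.2 enabled):

/* Broadcast a scalar into all 32 bf16 lanes. */
static __m512bh splat_one(void) {
  return _mm512_set1_pbh((__bf16)1.0f);
}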
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_castbf16_ps(__m512bh __a) {
  return (__m512)__a;
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_castbf16_pd(__m512bh __a) {
  return (__m512d)__a;
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castbf16_si512(__m512bh __a) {
  return (__m512i)__a;
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_castps_pbh(__m512 __a) {
  return (__m512bh)__a;
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_castpd_pbh(__m512d __a) {
  return (__m512bh)__a;
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_castsi512_pbh(__m512i __a) {
  return (__m512bh)__a;
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS512
_mm512_castbf16512_pbh128(__m512bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_castbf16512_pbh256(__m512bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_castbf16128_pbh512(__m128bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_castbf16256_pbh512(__m256bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_zextbf16128_pbh512(__m128bh __a) {
  return __builtin_shufflevector(
      __a, (__v8bf)_mm_setzero_pbh(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_zextbf16256_pbh512(__m256bh __a) {
  return __builtin_shufflevector(__a, (__v16bf)_mm256_setzero_pbh(), 0, 1, 2, 3,
                                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
                                 29, 30, 31);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_abs_pbh(__m512bh __A) {
  return (__m512bh)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF),
                                    (__m512i)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_load_pbh(void const *__p) {
  return *(const __m512bh *)__p;
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_loadu_pbh(void const *__p) {
  struct __loadu_pbh {
    __m512bh_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_pbh *)__p)->__v;
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_pbh(void *__P,
                                                              __m512bh __A) {
  *(__m512bh *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_pbh(void *__P,
                                                               __m512bh __A) {
  struct __storeu_pbh {
    __m512bh_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_pbh *)__P)->__v = __A;
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) {
  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, (__v32bf)__W,
                                                (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                  (__v32hi)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_permutexvar_pbh(__m512i __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_add_pbh(__m512bh __A,
                                                                __m512bh __B) {
  return (__m512bh)((__v32bf)__A + (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_add_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_add_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_add_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_add_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sub_pbh(__m512bh __A,
                                                                __m512bh __B) {
  return (__m512bh)((__v32bf)__A - (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_sub_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_sub_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_sub_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mul_pbh(__m512bh __A,
                                                                __m512bh __B) {
  return (__m512bh)((__v32bf)__A * (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_mul_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_mul_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_mul_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_div_pbh(__m512bh __A,
                                                                __m512bh __B) {
  return (__m512bh)((__v32bf)__A / (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_div_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_div_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_div_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_div_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_max_pbh(__m512bh __A,
                                                                __m512bh __B) {
  return (__m512bh)__builtin_ia32_vmaxbf16512((__v32bf)__A, (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_max_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_max_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_max_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_max_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_min_pbh(__m512bh __A,
                                                                __m512bh __B) {
  return (__m512bh)__builtin_ia32_vminbf16512((__v32bf)__A, (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_min_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_min_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_min_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_min_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

#define _mm512_cmp_pbh_mask(__A, __B, __P) \
  ((__mmask32)__builtin_ia32_vcmpbf16512_mask((__v32bf)(__m512bh)(__A), \
                                              (__v32bf)(__m512bh)(__B), \
                                              (int)(__P), (__mmask32) - 1))

#define _mm512_mask_cmp_pbh_mask(__U, __A, __B, __P) \
  ((__mmask32)__builtin_ia32_vcmpbf16512_mask((__v32bf)(__m512bh)(__A), \
                                              (__v32bf)(__m512bh)(__B), \
                                              (int)(__P), (__mmask32)(__U)))

#define _mm512_mask_fpclass_pbh_mask(__U, __A, imm) \
  ((__mmask32)__builtin_ia32_vfpclassbf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__mmask32)(__U)))

#define _mm512_fpclass_pbh_mask(__A, imm) \
  ((__mmask32)__builtin_ia32_vfpclassbf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__mmask32) - 1))

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_scalef_pbh(__m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vscalefbf16512_mask(
      (__v32bf)__A, (__v32bf)__B, (__v32bf)_mm512_undefined_pbh(),
      (__mmask32)-1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_scalef_pbh(
    __m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vscalefbf16512_mask(
      (__v32bf)__A, (__v32bf)__B, (__v32bf)__W, (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vscalefbf16512_mask(
      (__v32bf)__A, (__v32bf)__B, (__v32bf)_mm512_setzero_pbh(),
      (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_rcp_pbh(__m512bh __A) {
  return (__m512bh)__builtin_ia32_vrcpbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_rcp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vrcpbf16512_mask((__v32bf)__A, (__v32bf)__W,
                                                   (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp_pbh(__mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vrcpbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_getexp_pbh(__m512bh __A) {
  return (__m512bh)__builtin_ia32_vgetexpbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vgetexpbf16512_mask(
      (__v32bf)__A, (__v32bf)__W, (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_pbh(__mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vgetexpbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_rsqrt_pbh(__m512bh __A) {
  return (__m512bh)__builtin_ia32_vrsqrtbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vrsqrtbf16512_mask((__v32bf)__A, (__v32bf)__W,
                                                     (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vrsqrtbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U);
}

#define _mm512_reduce_pbh(__A, imm) \
  ((__m512bh)__builtin_ia32_vreducebf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_undefined_pbh(), \
      (__mmask32) - 1))

#define _mm512_mask_reduce_pbh(__W, __U, __A, imm) \
  ((__m512bh)__builtin_ia32_vreducebf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)(__m512bh)(__W), \
      (__mmask32)(__U)))

#define _mm512_maskz_reduce_pbh(__U, __A, imm) \
  ((__m512bh)__builtin_ia32_vreducebf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(), \
      (__mmask32)(__U)))

#define _mm512_roundscale_pbh(__A, imm) \
  ((__m512bh)__builtin_ia32_vrndscalebf16_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(), \
      (__mmask32) - 1))

#define _mm512_mask_roundscale_pbh(__W, __U, __A, imm) \
  ((__m512bh)__builtin_ia32_vrndscalebf16_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)(__m512bh)(__W), \
      (__mmask32)(__U)))

#define _mm512_maskz_roundscale_pbh(__U, __A, imm) \
  ((__m512bh)__builtin_ia32_vrndscalebf16_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(), \
      (__mmask32)(__U)))

#define _mm512_getmant_pbh(__A, __B, __C) \
  ((__m512bh)__builtin_ia32_vgetmantbf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)), \
      (__v32bf)_mm512_undefined_pbh(), (__mmask32) - 1))

#define _mm512_mask_getmant_pbh(__W, __U, __A, __B, __C) \
  ((__m512bh)__builtin_ia32_vgetmantbf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)), \
      (__v32bf)(__m512bh)(__W), (__mmask32)(__U)))

#define _mm512_maskz_getmant_pbh(__U, __A, __B, __C) \
  ((__m512bh)__builtin_ia32_vgetmantbf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)), \
      (__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U)))

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sqrt_pbh(__m512bh __A) {
  return (__m512bh)__builtin_ia32_vsqrtbf16512((__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_sqrt_pbh(__A), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_pbh(__mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
                                                (__v32bf)_mm512_sqrt_pbh(__A),
                                                (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_fmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, (__v32bf)__B,
                                                (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_pbh(__m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fmadd_pbh(
    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmadd_pbh(
    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_fmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, (__v32bf)__B,
                                                -(__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_fmsub_pbh(__m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fmsub_pbh(
    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmsub_pbh(
    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_fnmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, -(__v32bf)__B,
                                                (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmadd_pbh(
    __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmadd_pbh(
    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmadd_pbh(
    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_fnmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, -(__v32bf)__B,
                                                -(__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmsub_pbh(
    __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmsub_pbh(
    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsub_pbh(
    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)_mm512_setzero_pbh());
}

#undef __DEFAULT_FN_ATTRS512

#endif
#endif
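A brief usage sketch for the masked forms above (illustrative names, assuming a toolchain and CPU with AVX10.2-512 support): lanes whose mask bit is clear pass the write-through operand unchanged.

#include <immintrin.h>

/* Add b into a only in the lanes selected by `keep`. */
static __m512bh masked_sum(__m512bh a, __m512bh b, __mmask32 keep) {
  return _mm512_mask_add_pbh(a, keep, a, b);
}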
@@ -1,322 +0,0 @@
/*===--------- avx10_2_512convertintrin.h - AVX10_2_512CONVERT -------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2_512convertintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifdef __SSE2__

#ifndef __AVX10_2_512CONVERTINTRIN_H
#define __AVX10_2_512CONVERTINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \
                 __min_vector_width__(512)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_cvtx2ps_ph(__m512 __A,
                                                                  __m512 __B) {
  return (__m512h)__builtin_ia32_vcvt2ps2phx512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)(-1),
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtx2ps_ph(__m512h __W, __mmask32 __U, __m512 __A, __m512 __B) {
  return (__m512h)__builtin_ia32_vcvt2ps2phx512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v32hf)__W, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtx2ps_ph(__mmask32 __U, __m512 __A, __m512 __B) {
  return (__m512h)__builtin_ia32_vcvt2ps2phx512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvtx_round2ps_ph(A, B, R) \
  ((__m512h)__builtin_ia32_vcvt2ps2phx512_mask( \
      (__v16sf)(A), (__v16sf)(B), (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)(-1), (const int)(R)))

#define _mm512_mask_cvtx_round2ps_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_vcvt2ps2phx512_mask((__v16sf)(A), (__v16sf)(B), \
                                               (__v32hf)(W), (__mmask32)(U), \
                                               (const int)(R)))

#define _mm512_maskz_cvtx_round2ps_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_vcvt2ps2phx512_mask( \
      (__v16sf)(A), (__v16sf)(B), (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), (const int)(R)))
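A usage sketch for the pairwise conversion above (illustrative, assumes AVX10.2-512 support): it narrows two vectors of 16 fp32 values into a single vector of 32 fp16 values; the macro forms add explicit rounding control.

/* Narrow 32 floats (split across two __m512 operands) to fp16. */
static __m512h pack_two_ps(__m512 a, __m512 b) {
  return _mm512_cvtx2ps_ph(a, b);
}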
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtbiasph_bf8(__m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2bf8_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(),
      (__mmask32)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiasph_bf8(
    __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2bf8_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtbiasph_bf8(__mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2bf8_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(),
      (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvts_biasph_bf8(__m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2bf8s_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(),
      (__mmask32)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvts_biasph_bf8(
    __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2bf8s_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvts_biasph_bf8(__mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2bf8s_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(),
      (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtbiasph_hf8(__m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2hf8_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(),
      (__mmask32)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiasph_hf8(
    __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2hf8_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtbiasph_hf8(__mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2hf8_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(),
      (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvts_biasph_hf8(__m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2hf8s_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(),
      (__mmask32)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvts_biasph_hf8(
    __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2hf8s_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvts_biasph_hf8(__mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2hf8s_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(),
      (__mmask32)__U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvt2ph_bf8(__m512h __A,
                                                                  __m512h __B) {
  return (__m512i)__builtin_ia32_vcvt2ph2bf8_512((__v32hf)(__A),
                                                 (__v32hf)(__B));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvt2ph_bf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) {
  return (__m512i)__builtin_ia32_selectb_512(
      (__mmask64)__U, (__v64qi)_mm512_cvt2ph_bf8(__A, __B), (__v64qi)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvt2ph_bf8(__mmask64 __U, __m512h __A, __m512h __B) {
  return (__m512i)__builtin_ia32_selectb_512(
      (__mmask64)__U, (__v64qi)_mm512_cvt2ph_bf8(__A, __B),
      (__v64qi)(__m512i)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvts_2ph_bf8(__m512h __A, __m512h __B) {
  return (__m512i)__builtin_ia32_vcvt2ph2bf8s_512((__v32hf)(__A),
                                                  (__v32hf)(__B));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvts_2ph_bf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) {
  return (__m512i)__builtin_ia32_selectb_512(
      (__mmask64)__U, (__v64qi)_mm512_cvts_2ph_bf8(__A, __B), (__v64qi)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvts_2ph_bf8(__mmask64 __U, __m512h __A, __m512h __B) {
  return (__m512i)__builtin_ia32_selectb_512(
      (__mmask64)__U, (__v64qi)_mm512_cvts_2ph_bf8(__A, __B),
      (__v64qi)(__m512i)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvt2ph_hf8(__m512h __A,
                                                                  __m512h __B) {
  return (__m512i)__builtin_ia32_vcvt2ph2hf8_512((__v32hf)(__A),
                                                 (__v32hf)(__B));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvt2ph_hf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) {
  return (__m512i)__builtin_ia32_selectb_512(
      (__mmask64)__U, (__v64qi)_mm512_cvt2ph_hf8(__A, __B), (__v64qi)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvt2ph_hf8(__mmask64 __U, __m512h __A, __m512h __B) {
  return (__m512i)__builtin_ia32_selectb_512(
      (__mmask64)__U, (__v64qi)_mm512_cvt2ph_hf8(__A, __B),
      (__v64qi)(__m512i)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvts_2ph_hf8(__m512h __A, __m512h __B) {
  return (__m512i)__builtin_ia32_vcvt2ph2hf8s_512((__v32hf)(__A),
                                                  (__v32hf)(__B));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvts_2ph_hf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) {
  return (__m512i)__builtin_ia32_selectb_512(
      (__mmask64)__U, (__v64qi)_mm512_cvts_2ph_hf8(__A, __B), (__v64qi)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvts_2ph_hf8(__mmask64 __U, __m512h __A, __m512h __B) {
  return (__m512i)__builtin_ia32_selectb_512(
      (__mmask64)__U, (__v64qi)_mm512_cvts_2ph_hf8(__A, __B),
      (__v64qi)(__m512i)_mm512_setzero_si512());
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_cvthf8_ph(__m256i __A) {
  return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask(
      (__v32qi)__A, (__v32hf)(__m512h)_mm512_undefined_ph(), (__mmask32)-1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_cvthf8_ph(__m512h __W, __mmask32 __U, __m256i __A) {
  return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask(
      (__v32qi)__A, (__v32hf)(__m512h)__W, (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvthf8_ph(__mmask32 __U, __m256i __A) {
  return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask(
      (__v32qi)__A, (__v32hf)(__m512h)_mm512_setzero_ph(), (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtph_bf8(__m512h __A) {
  return (__m256i)__builtin_ia32_vcvtph2bf8_512_mask(
      (__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_bf8(__m256i __W, __mmask32 __U, __m512h __A) {
  return (__m256i)__builtin_ia32_vcvtph2bf8_512_mask(
      (__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_bf8(__mmask32 __U, __m512h __A) {
  return (__m256i)__builtin_ia32_vcvtph2bf8_512_mask(
      (__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvts_ph_bf8(__m512h __A) {
  return (__m256i)__builtin_ia32_vcvtph2bf8s_512_mask(
      (__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvts_ph_bf8(__m256i __W, __mmask32 __U, __m512h __A) {
  return (__m256i)__builtin_ia32_vcvtph2bf8s_512_mask(
      (__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvts_ph_bf8(__mmask32 __U, __m512h __A) {
  return (__m256i)__builtin_ia32_vcvtph2bf8s_512_mask(
      (__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtph_hf8(__m512h __A) {
  return (__m256i)__builtin_ia32_vcvtph2hf8_512_mask(
      (__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_hf8(__m256i __W, __mmask32 __U, __m512h __A) {
  return (__m256i)__builtin_ia32_vcvtph2hf8_512_mask(
      (__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_hf8(__mmask32 __U, __m512h __A) {
  return (__m256i)__builtin_ia32_vcvtph2hf8_512_mask(
      (__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvts_ph_hf8(__m512h __A) {
  return (__m256i)__builtin_ia32_vcvtph2hf8s_512_mask(
      (__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvts_ph_hf8(__m256i __W, __mmask32 __U, __m512h __A) {
  return (__m256i)__builtin_ia32_vcvtph2hf8s_512_mask(
      (__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
|
|
||||||
_mm512_maskz_cvts_ph_hf8(__mmask32 __U, __m512h __A) {
|
|
||||||
return (__m256i)__builtin_ia32_vcvtph2hf8s_512_mask(
|
|
||||||
(__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U);
|
|
||||||
}
|
|
||||||
|
|
||||||
static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_cvtbf8_ph(__m256i __A) {
|
|
||||||
return _mm512_castsi512_ph(_mm512_slli_epi16(_mm512_cvtepi8_epi16(__A), 8));
|
|
||||||
}
|
|
||||||
|
|
||||||
static __inline __m512h __DEFAULT_FN_ATTRS512
|
|
||||||
_mm512_mask_cvtbf8_ph(__m512h __S, __mmask32 __U, __m256i __A) {
|
|
||||||
return _mm512_castsi512_ph(
|
|
||||||
_mm512_mask_slli_epi16((__m512i)__S, __U, _mm512_cvtepi8_epi16(__A), 8));
|
|
||||||
}
|
|
||||||
|
|
||||||
static __inline __m512h __DEFAULT_FN_ATTRS512
|
|
||||||
_mm512_maskz_cvtbf8_ph(__mmask32 __U, __m256i __A) {
|
|
||||||
return _mm512_castsi512_ph(
|
|
||||||
_mm512_slli_epi16(_mm512_maskz_cvtepi8_epi16(__U, __A), 8));
|
|
||||||
}
|
|
||||||
|
|
||||||
#undef __DEFAULT_FN_ATTRS512
|
|
||||||
|
|
||||||
#endif // __AVX10_2_512CONVERTINTRIN_H
|
|
||||||
#endif // __SSE2__
|
|
||||||
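A usage sketch for the paired FP16-to-FP8 conversions above (this snippet is not part of the header; the function and variable names are hypothetical, and it assumes a compiler targeting avx10.2-512):

#include <immintrin.h>

// Pack two 32-element FP16 vectors into one 64-byte bf8 vector, merging
// masked-off byte lanes from `passthru`.
static inline __m512i pack_fp16_pair_to_bf8(__m512h a, __m512h b,
                                            __m512i passthru, __mmask64 keep) {
  return _mm512_mask_cvt2ph_bf8(passthru, keep, a, b);
}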
@@ -1,127 +0,0 @@
/*===---- avx10_2_512minmaxintrin.h - AVX10_2_512MINMAX intrinsics ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2_512minmaxintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AVX10_2_512MINMAXINTRIN_H
#define __AVX10_2_512MINMAXINTRIN_H

#define _mm512_minmax_pbh(A, B, C) \
  ((__m512bh)__builtin_ia32_vminmaxbf16512((__v32bf)(__m512bh)(A), \
                                           (__v32bf)(__m512bh)(B), (int)(C)))

#define _mm512_mask_minmax_pbh(W, U, A, B, C) \
  ((__m512bh)__builtin_ia32_selectpbf_512( \
      (__mmask32)(U), \
      (__v32bf)_mm512_minmax_pbh((__v32bf)(__m512bh)(A), \
                                 (__v32bf)(__m512bh)(B), (int)(C)), \
      (__v32bf)(__m512bh)(W)))

#define _mm512_maskz_minmax_pbh(U, A, B, C) \
  ((__m512bh)__builtin_ia32_selectpbf_512( \
      (__mmask32)(U), \
      (__v32bf)_mm512_minmax_pbh((__v32bf)(__m512bh)(A), \
                                 (__v32bf)(__m512bh)(B), (int)(C)), \
      (__v32bf) __builtin_bit_cast(__m512bh, _mm512_setzero_ps())))

#define _mm512_minmax_pd(A, B, C) \
  ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
      (__v8df)_mm512_undefined_pd(), (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_minmax_pd(W, U, A, B, C) \
  ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
      (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_minmax_pd(U, A, B, C) \
  ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
      (__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_minmax_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
      (__v8df)_mm512_undefined_pd(), (__mmask8)-1, (int)(R)))

#define _mm512_mask_minmax_round_pd(W, U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
      (__v8df)(__m512d)(W), (__mmask8)(U), (int)(R)))

#define _mm512_maskz_minmax_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
      (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))

#define _mm512_minmax_ph(A, B, C) \
  ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_minmax_ph(W, U, A, B, C) \
  ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
      (__v32hf)(__m512h)(W), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_minmax_ph(U, A, B, C) \
  ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_minmax_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_minmax_round_ph(W, U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
      (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))

#define _mm512_maskz_minmax_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

#define _mm512_minmax_ps(A, B, C) \
  ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
      (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_minmax_ps(W, U, A, B, C) \
  ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)(W), \
      (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_minmax_ps(U, A, B, C) \
  ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
      (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_minmax_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
      (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_minmax_round_ps(W, U, A, B, C, R) \
  ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_minmax_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
      (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
#endif // __AVX10_2_512MINMAXINTRIN_H
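A call-shape sketch for the minmax macros above (not from the header): the integer selector C chooses which min/max/magnitude variant is performed; the value 0 below is only a placeholder, and its exact meaning is an assumption to be checked against the ISA reference.

#include <immintrin.h>

// Zero-masking form: lanes whose mask bit is clear become 0.0.
static inline __m512d demo_minmax_pd(__m512d a, __m512d b, __mmask8 keep) {
  return _mm512_maskz_minmax_pd(keep, a, b, 0);
}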
@@ -1,314 +0,0 @@
/*===---- avx10_2_512niintrin.h - AVX10.2-512 new instruction intrinsics ---===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2_512niintrin.h> directly; include <immintrin.h> instead."
#endif

#ifdef __SSE2__

#ifndef __AVX10_2_512NIINTRIN_H
#define __AVX10_2_512NIINTRIN_H

#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \
                 __min_vector_width__(512)))

/* VNNI FP16 */
static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_dpph_ps(__m512 __W,
                                                           __m512h __A,
                                                           __m512h __B) {
  return (__m512)__builtin_ia32_vdpphps512((__v16sf)__W, (__v32hf)__A,
                                           (__v32hf)__B);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_dpph_ps(__m512 __W,
                                                                __mmask16 __U,
                                                                __m512h __A,
                                                                __m512h __B) {
  return (__m512)__builtin_ia32_selectps_512(
      (__mmask16)__U, (__v16sf)_mm512_dpph_ps(__W, __A, __B), (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_dpph_ps(__mmask16 __U,
                                                                 __m512 __W,
                                                                 __m512h __A,
                                                                 __m512h __B) {
  return (__m512)__builtin_ia32_selectps_512(
      (__mmask16)__U, (__v16sf)_mm512_dpph_ps(__W, __A, __B),
      (__v16sf)_mm512_setzero_ps());
}

/* VMPSADBW */
#define _mm512_mpsadbw_epu8(A, B, imm) \
  ((__m512i)__builtin_ia32_mpsadbw512((__v64qi)(__m512i)(A), \
                                      (__v64qi)(__m512i)(B), (int)(imm)))

#define _mm512_mask_mpsadbw_epu8(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectw_512( \
      (__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \
      (__v32hi)(__m512i)(W)))

#define _mm512_maskz_mpsadbw_epu8(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectw_512( \
      (__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \
      (__v32hi)_mm512_setzero_si512()))

/* VNNI INT8 */
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssd_epi32(__m512i __W,
                                                                 __m512i __A,
                                                                 __m512i __B) {
  return (__m512i)__builtin_ia32_vpdpbssd512((__v16si)__W, (__v16si)__A,
                                             (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbssd_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbssd_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssd_epi32(
    __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbssd_epi32(__W, __A, __B),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssds_epi32(__m512i __W,
                                                                  __m512i __A,
                                                                  __m512i __B) {
  return (__m512i)__builtin_ia32_vpdpbssds512((__v16si)__W, (__v16si)__A,
                                              (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbssds_epi32(
    __m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbssds_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssds_epi32(
    __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbssds_epi32(__W, __A, __B),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsud_epi32(__m512i __W,
                                                                 __m512i __A,
                                                                 __m512i __B) {
  return (__m512i)__builtin_ia32_vpdpbsud512((__v16si)__W, (__v16si)__A,
                                             (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbsud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbsud_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsud_epi32(
    __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbsud_epi32(__W, __A, __B),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsuds_epi32(__m512i __W,
                                                                  __m512i __A,
                                                                  __m512i __B) {
  return (__m512i)__builtin_ia32_vpdpbsuds512((__v16si)__W, (__v16si)__A,
                                              (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbsuds_epi32(
    __m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbsuds_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsuds_epi32(
    __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbsuds_epi32(__W, __A, __B),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuud_epi32(__m512i __W,
                                                                 __m512i __A,
                                                                 __m512i __B) {
  return (__m512i)__builtin_ia32_vpdpbuud512((__v16si)__W, (__v16si)__A,
                                             (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbuud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbuud_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuud_epi32(
    __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbuud_epi32(__W, __A, __B),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuuds_epi32(__m512i __W,
                                                                  __m512i __A,
                                                                  __m512i __B) {
  return (__m512i)__builtin_ia32_vpdpbuuds512((__v16si)__W, (__v16si)__A,
                                              (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbuuds_epi32(
    __m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbuuds_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuuds_epi32(
    __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbuuds_epi32(__W, __A, __B),
      (__v16si)_mm512_setzero_si512());
}

/* VNNI INT16 */
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsud_epi32(__m512i __A,
                                                                 __m512i __B,
                                                                 __m512i __C) {
  return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v16si)__B,
                                             (__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwsud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwsud_epi32(__A, __B, __C),
      (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsud_epi32(
    __mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwsud_epi32(__A, __B, __C),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsuds_epi32(__m512i __A,
                                                                  __m512i __B,
                                                                  __m512i __C) {
  return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v16si)__B,
                                              (__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwsuds_epi32(
    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwsuds_epi32(__A, __B, __C),
      (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsuds_epi32(
    __mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwsuds_epi32(__A, __B, __C),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusd_epi32(__m512i __A,
                                                                 __m512i __B,
                                                                 __m512i __C) {
  return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v16si)__B,
                                             (__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwusd_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwusd_epi32(__A, __B, __C),
      (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusd_epi32(
    __mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwusd_epi32(__A, __B, __C),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusds_epi32(__m512i __A,
                                                                  __m512i __B,
                                                                  __m512i __C) {
  return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v16si)__B,
                                              (__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwusds_epi32(
    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwusds_epi32(__A, __B, __C),
      (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusds_epi32(
    __mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwusds_epi32(__A, __B, __C),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuud_epi32(__m512i __A,
                                                                 __m512i __B,
                                                                 __m512i __C) {
  return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v16si)__B,
                                             (__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwuud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwuud_epi32(__A, __B, __C),
      (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuud_epi32(
    __mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwuud_epi32(__A, __B, __C),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuuds_epi32(__m512i __A,
                                                                  __m512i __B,
                                                                  __m512i __C) {
  return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v16si)__B,
                                              (__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwuuds_epi32(
    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwuuds_epi32(__A, __B, __C),
      (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuuds_epi32(
    __mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwuuds_epi32(__A, __B, __C),
      (__v16si)_mm512_setzero_si512());
}

#undef __DEFAULT_FN_ATTRS

#endif /* __SSE2__ */
#endif /* __AVX10_2_512NIINTRIN_H */
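A usage sketch for the VNNI INT8 intrinsics above (not from the header; the names are hypothetical): each 32-bit lane of the result accumulates four adjacent byte products into the running sum.

#include <immintrin.h>

// Signed-by-signed int8 dot product with 32-bit accumulation.
static inline __m512i int8_dot_accumulate(__m512i acc, __m512i a, __m512i b) {
  return _mm512_dpbssd_epi32(acc, a, b);
}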
@@ -1,307 +0,0 @@
/*===----- avx10_2_512satcvtdsintrin.h - AVX10_2_512SATCVTDS intrinsics ----===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2_512satcvtdsintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX10_2_512SATCVTDSINTRIN_H
#define __AVX10_2_512SATCVTDSINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \
                 __min_vector_width__(512)))

// 512 bit : Double -> Int
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_cvtts_pd_epi32(__m512d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask(
      (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_mask_cvtts_pd_epi32(__m256i __W, __mmask8 __U, __m512d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask(
      (__v8df)__A, (__v8si)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_maskz_cvtts_pd_epi32(__mmask8 __U, __m512d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask(
      (__v8df)__A, (__v8si)_mm256_setzero_si256(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundpd_epi32(__A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8si)_mm256_undefined_si256(), \
      (__mmask8) - 1, (const int)(__R)))

#define _mm512_mask_cvtts_roundpd_epi32(__W, __U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8si)(__m256i)(__W), (__mmask8)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundpd_epi32(__U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8si)_mm256_setzero_si256(), (__mmask8)(__U), \
      (const int)(__R)))

// 512 bit : Double -> uInt
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_cvtts_pd_epu32(__m512d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask(
      (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_mask_cvtts_pd_epu32(__m256i __W, __mmask8 __U, __m512d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask(
      (__v8df)__A, (__v8si)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_maskz_cvtts_pd_epu32(__mmask8 __U, __m512d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask(
      (__v8df)__A, (__v8si)_mm256_setzero_si256(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundpd_epu32(__A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8si)_mm256_undefined_si256(), \
      (__mmask8) - 1, (const int)(__R)))

#define _mm512_mask_cvtts_roundpd_epu32(__W, __U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8si)(__m256i)(__W), (__mmask8)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundpd_epu32(__U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8si)_mm256_setzero_si256(), (__mmask8)(__U), \
      (const int)(__R)))

// 512 bit : Double -> Long

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_cvtts_pd_epi64(__m512d __A) {
  return ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask(
      (__v8df)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_cvtts_pd_epi64(__m512i __W, __mmask8 __U, __m512d __A) {
  return ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask(
      (__v8df)__A, (__v8di)__W, __U, _MM_FROUND_CUR_DIRECTION));
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_cvtts_pd_epi64(__mmask8 __U, __m512d __A) {
  return ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask(
      (__v8df)__A, (__v8di)_mm512_setzero_si512(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundpd_epi64(__A, __R) \
  ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8di)_mm512_undefined_epi32(), \
      (__mmask8) - 1, (const int)(__R)))

#define _mm512_mask_cvtts_roundpd_epi64(__W, __U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8di)(__m512i)(__W), (__mmask8)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundpd_epi64(__U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8di)_mm512_setzero_si512(), (__mmask8)(__U), \
      (const int)(__R)))

// 512 bit : Double -> ULong

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_cvtts_pd_epu64(__m512d __A) {
  return ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask(
      (__v8df)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_cvtts_pd_epu64(__m512i __W, __mmask8 __U, __m512d __A) {
  return ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask(
      (__v8df)__A, (__v8di)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_cvtts_pd_epu64(__mmask8 __U, __m512d __A) {
  return ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask(
      (__v8df)__A, (__v8di)_mm512_setzero_si512(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundpd_epu64(__A, __R) \
  ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8di)_mm512_undefined_epi32(), \
      (__mmask8) - 1, (const int)(__R)))

#define _mm512_mask_cvtts_roundpd_epu64(__W, __U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8di)(__m512i)(__W), (__mmask8)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundpd_epu64(__U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8di)_mm512_setzero_si512(), (__mmask8)(__U), \
      (const int)(__R)))

// 512 bit: Float -> int
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvtts_ps_epi32(__m512 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask(
      (__v16sf)(__A), (__v16si)_mm512_undefined_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_cvtts_ps_epi32(__m512i __W, __mmask16 __U, __m512 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask(
      (__v16sf)(__A), (__v16si)(__W), __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_cvtts_ps_epi32(__mmask16 __U, __m512 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask(
      (__v16sf)(__A), (__v16si)_mm512_setzero_si512(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundps_epi32(__A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask( \
      (__v16sf)(__m512)(__A), (__v16si)_mm512_undefined_epi32(), \
      (__mmask16) - 1, (const int)(__R)))

#define _mm512_mask_cvtts_roundps_epi32(__W, __U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask( \
      (__v16sf)(__m512)(__A), (__v16si)(__m512i)(__W), (__mmask16)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundps_epi32(__U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask( \
      (__v16sf)(__m512)(__A), (__v16si)_mm512_setzero_si512(), \
      (__mmask16)(__U), (const int)(__R)))

// 512 bit: Float -> uint
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvtts_ps_epu32(__m512 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask(
      (__v16sf)(__A), (__v16si)_mm512_undefined_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_cvtts_ps_epu32(__m512i __W, __mmask16 __U, __m512 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask(
      (__v16sf)(__A), (__v16si)(__W), __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_cvtts_ps_epu32(__mmask16 __U, __m512 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask(
      (__v16sf)(__A), (__v16si)_mm512_setzero_si512(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundps_epu32(__A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask( \
      (__v16sf)(__m512)(__A), (__v16si)_mm512_undefined_epi32(), \
      (__mmask16) - 1, (const int)(__R)))

#define _mm512_mask_cvtts_roundps_epu32(__W, __U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask( \
      (__v16sf)(__m512)(__A), (__v16si)(__m512i)(__W), (__mmask16)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundps_epu32(__U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask( \
      (__v16sf)(__m512)(__A), (__v16si)_mm512_setzero_si512(), \
      (__mmask16)(__U), (const int)(__R)))

// 512 bit : float -> long
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvtts_ps_epi64(__m256 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask(
      (__v8sf)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_cvtts_ps_epi64(__m512i __W, __mmask8 __U, __m256 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask(
      (__v8sf)__A, (__v8di)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_cvtts_ps_epi64(__mmask8 __U, __m256 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask(
      (__v8sf)__A, (__v8di)_mm512_setzero_si512(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundps_epi64(__A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask( \
      (__v8sf)(__m256)(__A), (__v8di)_mm512_undefined_epi32(), (__mmask8) - 1, \
      (const int)(__R)))

#define _mm512_mask_cvtts_roundps_epi64(__W, __U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask( \
      (__v8sf)(__m256)(__A), (__v8di)(__m512i)(__W), (__mmask8)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundps_epi64(__U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask( \
      (__v8sf)(__m256)(__A), (__v8di)_mm512_setzero_si512(), (__mmask8)(__U), \
      (const int)(__R)))

// 512 bit : float -> ulong
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvtts_ps_epu64(__m256 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask(
      (__v8sf)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_cvtts_ps_epu64(__m512i __W, __mmask8 __U, __m256 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask(
      (__v8sf)__A, (__v8di)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_cvtts_ps_epu64(__mmask8 __U, __m256 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask(
      (__v8sf)__A, (__v8di)_mm512_setzero_si512(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundps_epu64(__A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask( \
      (__v8sf)(__m256)(__A), (__v8di)_mm512_undefined_epi32(), (__mmask8) - 1, \
      (const int)(__R)))

#define _mm512_mask_cvtts_roundps_epu64(__W, __U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask( \
      (__v8sf)(__m256)(__A), (__v8di)(__m512i)(__W), (__mmask8)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundps_epu64(__U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask( \
      (__v8sf)(__m256)(__A), (__v8di)_mm512_setzero_si512(), (__mmask8)(__U), \
      (const int)(__R)))

#undef __DEFAULT_FN_ATTRS
#endif // __AVX10_2_512SATCVTDSINTRIN_H
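A usage sketch for the saturating truncations above (not from the header): the "s" variants are assumed to clamp out-of-range inputs to the destination type's limits instead of producing the integer indefinite value.

#include <immintrin.h>

// Truncate eight doubles to eight 32-bit integers with saturation.
static inline __m256i doubles_to_i32_saturated(__m512d x) {
  return _mm512_cvtts_pd_epi32(x);
}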
@@ -1,301 +0,0 @@
/*===------ avx10_2_512satcvtintrin.h - AVX10_2_512SATCVT intrinsics -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2_512satcvtintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AVX10_2_512SATCVTINTRIN_H
#define __AVX10_2_512SATCVTINTRIN_H

#define _mm512_ipcvts_bf16_epi8(A) \
  ((__m512i)__builtin_ia32_vcvtbf162ibs512((__v32bf)(__m512bh)(A)))

#define _mm512_mask_ipcvts_bf16_epi8(W, U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
                                       (__v32hi)_mm512_ipcvts_bf16_epi8(A), \
                                       (__v32hi)(__m512i)(W)))

#define _mm512_maskz_ipcvts_bf16_epi8(U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
                                       (__v32hi)_mm512_ipcvts_bf16_epi8(A), \
                                       (__v32hi)_mm512_setzero_si512()))

#define _mm512_ipcvts_bf16_epu8(A) \
  ((__m512i)__builtin_ia32_vcvtbf162iubs512((__v32bf)(__m512bh)(A)))

#define _mm512_mask_ipcvts_bf16_epu8(W, U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
                                       (__v32hi)_mm512_ipcvts_bf16_epu8(A), \
                                       (__v32hi)(__m512i)(W)))

#define _mm512_maskz_ipcvts_bf16_epu8(U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
                                       (__v32hi)_mm512_ipcvts_bf16_epu8(A), \
                                       (__v32hi)_mm512_setzero_si512()))

#define _mm512_ipcvtts_bf16_epi8(A) \
  ((__m512i)__builtin_ia32_vcvttbf162ibs512((__v32bf)(__m512bh)(A)))

#define _mm512_mask_ipcvtts_bf16_epi8(W, U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
                                       (__v32hi)_mm512_ipcvtts_bf16_epi8(A), \
                                       (__v32hi)(__m512i)(W)))

#define _mm512_maskz_ipcvtts_bf16_epi8(U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
                                       (__v32hi)_mm512_ipcvtts_bf16_epi8(A), \
                                       (__v32hi)_mm512_setzero_si512()))

#define _mm512_ipcvtts_bf16_epu8(A) \
  ((__m512i)__builtin_ia32_vcvttbf162iubs512((__v32bf)(__m512bh)(A)))

#define _mm512_mask_ipcvtts_bf16_epu8(W, U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
                                       (__v32hi)_mm512_ipcvtts_bf16_epu8(A), \
                                       (__v32hi)(__m512i)(W)))

#define _mm512_maskz_ipcvtts_bf16_epu8(U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
                                       (__v32hi)_mm512_ipcvtts_bf16_epu8(A), \
                                       (__v32hi)_mm512_setzero_si512()))

#define _mm512_ipcvts_ph_epi8(A) \
  ((__m512i)__builtin_ia32_vcvtph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32) - 1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvts_ph_epi8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvtph2ibs512_mask((__v32hf)(__m512h)(A), \
                                              (__v32hu)(W), (__mmask32)(U), \
                                              _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvts_ph_epi8(U, A) \
  ((__m512i)__builtin_ia32_vcvtph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvts_roundph_epi8(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2ibs512_mask((__v32hf)(__m512h)(A), \
                                              (__v32hu)_mm512_setzero_si512(), \
                                              (__mmask32) - 1, (const int)R))

#define _mm512_mask_ipcvts_roundph_epi8(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)(W), (__mmask32)(U), (const int)R))

#define _mm512_maskz_ipcvts_roundph_epi8(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2ibs512_mask((__v32hf)(__m512h)(A), \
                                              (__v32hu)_mm512_setzero_si512(), \
                                              (__mmask32)(U), (const int)R))

#define _mm512_ipcvts_ph_epu8(A) \
  ((__m512i)__builtin_ia32_vcvtph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32) - 1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvts_ph_epu8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvtph2iubs512_mask((__v32hf)(__m512h)(A), \
                                               (__v32hu)(W), (__mmask32)(U), \
                                               _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvts_ph_epu8(U, A) \
  ((__m512i)__builtin_ia32_vcvtph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvts_roundph_epu8(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32) - 1, \
      (const int)R))

#define _mm512_mask_ipcvts_roundph_epu8(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)(W), (__mmask32)(U), (const int)R))

#define _mm512_maskz_ipcvts_roundph_epu8(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \
      (const int)R))

#define _mm512_ipcvts_ps_epi8(A) \
  ((__m512i)__builtin_ia32_vcvtps2ibs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16) - 1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvts_ps_epi8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvtps2ibs512_mask((__v16sf)(__m512)(A), \
                                              (__v16su)(W), (__mmask16)(U), \
                                              _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvts_ps_epi8(U, A) \
  ((__m512i)__builtin_ia32_vcvtps2ibs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvts_roundps_epi8(A, R) \
  ((__m512i)__builtin_ia32_vcvtps2ibs512_mask((__v16sf)(__m512)(A), \
                                              (__v16su)_mm512_setzero_si512(), \
                                              (__mmask16) - 1, (const int)R))

#define _mm512_mask_ipcvts_roundps_epi8(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtps2ibs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)(W), (__mmask16)(U), (const int)R))

#define _mm512_maskz_ipcvts_roundps_epi8(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtps2ibs512_mask((__v16sf)(__m512)(A), \
                                              (__v16su)_mm512_setzero_si512(), \
                                              (__mmask16)(U), (const int)R))

#define _mm512_ipcvts_ps_epu8(A) \
  ((__m512i)__builtin_ia32_vcvtps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16) - 1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvts_ps_epu8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvtps2iubs512_mask((__v16sf)(__m512)(A), \
                                               (__v16su)(W), (__mmask16)(U), \
                                               _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvts_ps_epu8(U, A) \
  ((__m512i)__builtin_ia32_vcvtps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvts_roundps_epu8(A, R) \
  ((__m512i)__builtin_ia32_vcvtps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16) - 1, \
      (const int)R))

#define _mm512_mask_ipcvts_roundps_epu8(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)(W), (__mmask16)(U), (const int)R))

#define _mm512_maskz_ipcvts_roundps_epu8(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \
      (const int)R))

#define _mm512_ipcvtts_ph_epi8(A) \
  ((__m512i)__builtin_ia32_vcvttph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32) - 1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvtts_ph_epi8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvttph2ibs512_mask((__v32hf)(__m512h)(A), \
                                               (__v32hu)(W), (__mmask32)(U), \
                                               _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvtts_ph_epi8(U, A) \
  ((__m512i)__builtin_ia32_vcvttph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvtts_roundph_epi8(A, S) \
  ((__m512i)__builtin_ia32_vcvttph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32) - 1, \
      S))

#define _mm512_mask_ipcvtts_roundph_epi8(W, U, A, S) \
  ((__m512i)__builtin_ia32_vcvttph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)(W), (__mmask32)(U), S))

#define _mm512_maskz_ipcvtts_roundph_epi8(U, A, S) \
  ((__m512i)__builtin_ia32_vcvttph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \
      S))

#define _mm512_ipcvtts_ph_epu8(A) \
  ((__m512i)__builtin_ia32_vcvttph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32) - 1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvtts_ph_epu8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvttph2iubs512_mask((__v32hf)(__m512h)(A), \
                                                (__v32hu)(W), (__mmask32)(U), \
                                                _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvtts_ph_epu8(U, A) \
  ((__m512i)__builtin_ia32_vcvttph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvtts_roundph_epu8(A, S) \
  ((__m512i)__builtin_ia32_vcvttph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32) - 1, \
      S))

#define _mm512_mask_ipcvtts_roundph_epu8(W, U, A, S) \
  ((__m512i)__builtin_ia32_vcvttph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)(W), (__mmask32)(U), S))

#define _mm512_maskz_ipcvtts_roundph_epu8(U, A, S) \
  ((__m512i)__builtin_ia32_vcvttph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \
      S))

#define _mm512_ipcvtts_ps_epi8(A) \
  ((__m512i)__builtin_ia32_vcvttps2ibs512_mask( \
      (__v16sf)(__m512h)(A), (__v16su)_mm512_setzero_si512(), (__mmask16) - 1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvtts_ps_epi8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvttps2ibs512_mask((__v16sf)(__m512h)(A), \
                                               (__v16su)(W), (__mmask16)(U), \
                                               _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvtts_ps_epi8(U, A) \
  ((__m512i)__builtin_ia32_vcvttps2ibs512_mask( \
      (__v16sf)(__m512h)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvtts_roundps_epi8(A, S) \
  ((__m512i)__builtin_ia32_vcvttps2ibs512_mask( \
      (__v16sf)(__m512h)(A), (__v16su)_mm512_setzero_si512(), (__mmask16) - 1, \
      S))

#define _mm512_mask_ipcvtts_roundps_epi8(W, U, A, S) \
  ((__m512i)__builtin_ia32_vcvttps2ibs512_mask( \
      (__v16sf)(__m512h)(A), (__v16su)(W), (__mmask16)(U), S))

#define _mm512_maskz_ipcvtts_roundps_epi8(U, A, S) \
  ((__m512i)__builtin_ia32_vcvttps2ibs512_mask( \
      (__v16sf)(__m512h)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \
      S))

#define _mm512_ipcvtts_ps_epu8(A) \
  ((__m512i)__builtin_ia32_vcvttps2iubs512_mask( \
      (__v16sf)(__m512h)(A), (__v16su)_mm512_setzero_si512(), (__mmask16) - 1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvtts_ps_epu8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvttps2iubs512_mask((__v16sf)(__m512h)(A), \
                                                (__v16su)(W), (__mmask16)(U), \
                                                _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvtts_ps_epu8(U, A) \
  ((__m512i)__builtin_ia32_vcvttps2iubs512_mask( \
      (__v16sf)(__m512h)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvtts_roundps_epu8(A, S) \
  ((__m512i)__builtin_ia32_vcvttps2iubs512_mask( \
      (__v16sf)(__m512h)(A), (__v16su)_mm512_setzero_si512(), (__mmask16) - 1, \
      S))

#define _mm512_mask_ipcvtts_roundps_epu8(W, U, A, S) \
  ((__m512i)__builtin_ia32_vcvttps2iubs512_mask( \
      (__v16sf)(__m512h)(A), (__v16su)(W), (__mmask16)(U), S))

#define _mm512_maskz_ipcvtts_roundps_epu8(U, A, S) \
  ((__m512i)__builtin_ia32_vcvttps2iubs512_mask( \
      (__v16sf)(__m512h)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \
      S))

#endif // __AVX10_2_512SATCVTINTRIN_H
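A call-shape sketch for the ipcvt macros above (not from the header): how the byte-sized results are laid out inside the 512-bit destination is an assumption best verified against the ISA reference.

#include <immintrin.h>

// Saturating FP16 -> signed-byte conversion of all 32 elements.
static inline __m512i fp16_to_i8_saturated(__m512h x) {
  return _mm512_ipcvts_ph_epi8(x);
}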
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -1,66 +0,0 @@
/*===---- avx10_2copyintrin.h - AVX10.2 Copy intrinsics -------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2copyintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AVX10_2COPYINTRIN_H
#define __AVX10_2COPYINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
                 __min_vector_width__(128)))

/// Constructs a 128-bit integer vector, setting the lower 32 bits to the
/// lower 32 bits of the parameter \a __A; the upper bits are zeroed.
///
/// \code{.operation}
/// result[31:0] := __A[31:0]
/// result[MAX:32] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> VMOVD </c> instruction.
///
/// \param __A
///    A 128-bit integer vector.
/// \returns A 128-bit integer vector. The lower 32 bits are copied from the
///    parameter \a __A; the upper bits are zeroed.
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_move_epi32(__m128i __A) {
  return (__m128i)__builtin_shufflevector(
      (__v4si)__A, (__v4si)_mm_setzero_si128(), 0, 4, 4, 4);
}

/// Constructs a 128-bit integer vector, setting the lower 16 bits to the
/// lower 16 bits of the parameter \a __A; the upper bits are zeroed.
///
/// \code{.operation}
/// result[15:0] := __A[15:0]
/// result[MAX:16] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> VMOVW </c> instruction.
///
/// \param __A
///    A 128-bit integer vector.
/// \returns A 128-bit integer vector. The lower 16 bits are copied from the
///    parameter \a __A; the upper bits are zeroed.
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_move_epi16(__m128i __A) {
  return (__m128i)__builtin_shufflevector(
      (__v8hi)__A, (__v8hi)_mm_setzero_si128(), 0, 8, 8, 8, 8, 8, 8, 8);
}
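The zero-extending moves above are easy to sanity-check against plain masking; a small illustrative sketch (the helper name and test values are hypothetical, and the final compare assumes SSE4.1):

/* Illustrative check, not part of the header: _mm_move_epi32 should
   behave like clearing every bit above bit 31. */
#include <immintrin.h>

static int check_move_epi32(void) {
  __m128i x = _mm_set_epi32(4, 3, 2, 1);
  __m128i moved = _mm_move_epi32(x);                      /* {1, 0, 0, 0} */
  __m128i masked = _mm_and_si128(x, _mm_set_epi32(0, 0, 0, -1));
  __m128i diff = _mm_xor_si128(moved, masked);
  return _mm_testz_si128(diff, diff);                     /* 1 when equal */
}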
#undef __DEFAULT_FN_ATTRS128

#endif // __AVX10_2COPYINTRIN_H
@@ -1,232 +0,0 @@
/*===-------- avx10_2minmaxintrin.h - AVX10_2MINMAX intrinsics -------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2minmaxintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AVX10_2MINMAXINTRIN_H
#define __AVX10_2MINMAXINTRIN_H

#define _mm_minmax_pbh(A, B, C) \
  ((__m128bh)__builtin_ia32_vminmaxbf16128((__m128bh)(__v8bf)(A), \
                                           (__m128bh)(__v8bf)(B), (int)(C)))

#define _mm_mask_minmax_pbh(W, U, A, B, C) \
  ((__m128bh)__builtin_ia32_selectpbf_128( \
      (__mmask8)(U), \
      (__v8bf)_mm_minmax_pbh((__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), \
                             (int)(C)), \
      (__v8bf)(W)))

#define _mm_maskz_minmax_pbh(U, A, B, C) \
  ((__m128bh)__builtin_ia32_selectpbf_128( \
      (__mmask8)(U), \
      (__v8bf)_mm_minmax_pbh((__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), \
                             (int)(C)), \
      (__v8bf)__builtin_bit_cast(__m128bh, _mm_setzero_ps())))

#define _mm256_minmax_pbh(A, B, C) \
  ((__m256bh)__builtin_ia32_vminmaxbf16256((__m256bh)(__v16bf)(A), \
                                           (__m256bh)(__v16bf)(B), (int)(C)))

#define _mm256_mask_minmax_pbh(W, U, A, B, C) \
  ((__m256bh)__builtin_ia32_selectpbf_256( \
      (__mmask16)(U), \
      (__v16bf)_mm256_minmax_pbh((__m256bh)(__v16bf)(A), \
                                 (__m256bh)(__v16bf)(B), (int)(C)), \
      (__v16bf)(W)))

#define _mm256_maskz_minmax_pbh(U, A, B, C) \
  ((__m256bh)__builtin_ia32_selectpbf_256( \
      (__mmask16)(U), \
      (__v16bf)_mm256_minmax_pbh((__m256bh)(__v16bf)(A), \
                                 (__m256bh)(__v16bf)(B), (int)(C)), \
      (__v16bf)__builtin_bit_cast(__m256bh, _mm256_setzero_ps())))

#define _mm_minmax_pd(A, B, C) \
  ((__m128d)__builtin_ia32_vminmaxpd128_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_setzero_pd(), (__mmask8)-1))

#define _mm_mask_minmax_pd(W, U, A, B, C) \
  ((__m128d)__builtin_ia32_vminmaxpd128_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)(__m128d)(W), (__mmask8)(U)))

#define _mm_maskz_minmax_pd(U, A, B, C) \
  ((__m128d)__builtin_ia32_vminmaxpd128_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_setzero_pd(), (__mmask8)(U)))

#define _mm256_minmax_pd(A, B, C) \
  ((__m256d)__builtin_ia32_vminmaxpd256_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)_mm256_setzero_pd(), (__mmask8)-1))

#define _mm256_mask_minmax_pd(W, U, A, B, C) \
  ((__m256d)__builtin_ia32_vminmaxpd256_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)(__m256d)(W), (__mmask8)(U)))

#define _mm256_maskz_minmax_pd(U, A, B, C) \
  ((__m256d)__builtin_ia32_vminmaxpd256_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)_mm256_setzero_pd(), (__mmask8)(U)))

#define _mm_minmax_ph(A, B, C) \
  ((__m128h)__builtin_ia32_vminmaxph128_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1))

#define _mm_mask_minmax_ph(W, U, A, B, C) \
  ((__m128h)__builtin_ia32_vminmaxph128_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)(__m128h)(W), (__mmask8)(U)))

#define _mm_maskz_minmax_ph(U, A, B, C) \
  ((__m128h)__builtin_ia32_vminmaxph128_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U)))

#define _mm256_minmax_ph(A, B, C) \
  ((__m256h)__builtin_ia32_vminmaxph256_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)-1))

#define _mm256_mask_minmax_ph(W, U, A, B, C) \
  ((__m256h)__builtin_ia32_vminmaxph256_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
      (__v16hf)(__m256h)(W), (__mmask16)(U)))

#define _mm256_maskz_minmax_ph(U, A, B, C) \
  ((__m256h)__builtin_ia32_vminmaxph256_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)(U)))

#define _mm_minmax_ps(A, B, C) \
  ((__m128)__builtin_ia32_vminmaxps128_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)-1))

#define _mm_mask_minmax_ps(W, U, A, B, C) \
  ((__m128)__builtin_ia32_vminmaxps128_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), \
      (__mmask8)(U)))

#define _mm_maskz_minmax_ps(U, A, B, C) \
  ((__m128)__builtin_ia32_vminmaxps128_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)(U)))

#define _mm256_minmax_ps(A, B, C) \
  ((__m256)__builtin_ia32_vminmaxps256_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
      (__v8sf)_mm256_setzero_ps(), (__mmask8)-1))

#define _mm256_mask_minmax_ps(W, U, A, B, C) \
  ((__m256)__builtin_ia32_vminmaxps256_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), \
      (__mmask8)(U)))

#define _mm256_maskz_minmax_ps(U, A, B, C) \
  ((__m256)__builtin_ia32_vminmaxps256_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
      (__v8sf)_mm256_setzero_ps(), (__mmask8)(U)))

#define _mm_minmax_sd(A, B, C) \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_undefined_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_minmax_sd(W, U, A, B, C) \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)(__m128d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_minmax_sd(U, A, B, C) \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_minmax_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_undefined_pd(), (__mmask8)-1, (int)(R)))

#define _mm_mask_minmax_round_sd(W, U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_minmax_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)))

#define _mm_minmax_sh(A, B, C) \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)_mm_undefined_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_minmax_sh(W, U, A, B, C) \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_minmax_sh(U, A, B, C) \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_minmax_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm_mask_minmax_round_sh(W, U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_minmax_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm_minmax_ss(A, B, C) \
  ((__m128)__builtin_ia32_vminmaxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_undefined_ps(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_minmax_ss(W, U, A, B, C) \
  ((__m128)__builtin_ia32_vminmaxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(W), \
      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_minmax_ss(U, A, B, C) \
  ((__m128)__builtin_ia32_vminmaxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_minmax_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vminmaxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_undefined_ps(), (__mmask8)-1, (int)(R)))

#define _mm_mask_minmax_round_ss(W, U, A, B, C, R) \
  ((__m128)__builtin_ia32_vminmaxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_minmax_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vminmaxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)))

#endif // __AVX10_2MINMAXINTRIN_H
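A hedged usage sketch of the macros above; the helper name and the selector value 0 are placeholders, since the immediate picks one of the IEEE 754-2019 min/max operations defined by the VMINMAX immediate encoding:

/* Sketch only: consult the VMINMAX encoding for the operation you
   actually want; 0 is used here purely as a compile-time constant. */
#include <immintrin.h>

static __m128d minmax_demo(__m128d a, __m128d b, __mmask8 keep) {
  /* maskz form: lanes with a clear bit in `keep` come back as 0.0. */
  return _mm_maskz_minmax_pd(keep, a, b, 0);
}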
@@ -1,409 +0,0 @@
/*===---- avx10_2niintrin.h - AVX10.2 new instruction intrinsics -----------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx10_2niintrin.h> directly; include <immintrin.h> instead."
#endif

#ifdef __SSE2__

#ifndef __AVX10_2NIINTRIN_H
#define __AVX10_2NIINTRIN_H

#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
                 __min_vector_width__(256)))

/* VNNI FP16 */
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_dpph_ps(__m128 __W,
                                                           __m128h __A,
                                                           __m128h __B) {
  return (__m128)__builtin_ia32_vdpphps128((__v4sf)__W, (__v8hf)__A,
                                           (__v8hf)__B);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_dpph_ps(__m128 __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm_dpph_ps(__W, __A, __B), (__v4sf)__W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_dpph_ps(__mmask8 __U,
                                                                 __m128 __W,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
                                             (__v4sf)_mm_dpph_ps(__W, __A, __B),
                                             (__v4sf)_mm_setzero_ps());
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_dpph_ps(__m256 __W,
                                                              __m256h __A,
                                                              __m256h __B) {
  return (__m256)__builtin_ia32_vdpphps256((__v8sf)__W, (__v16hf)__A,
                                           (__v16hf)__B);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_dpph_ps(__m256 __W, __mmask8 __U, __m256h __A, __m256h __B) {
  return (__m256)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B), (__v8sf)__W);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_dpph_ps(__mmask8 __U, __m256 __W, __m256h __A, __m256h __B) {
  return (__m256)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B),
      (__v8sf)_mm256_setzero_ps());
}
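The FP16 dot-product shape is easiest to see with concrete lanes; a hedged sketch (helper name illustrative): each 32-bit result lane accumulates one adjacent pair of FP16 products into the FP32 accumulator.

/* Sketch only: for each of the four FP32 lanes,
   result[i] = acc[i] + a[2i]*b[2i] + a[2i+1]*b[2i+1]. */
#include <immintrin.h>

static __m128 fp16_dot_step(__m128 acc, __m128h a, __m128h b) {
  return _mm_dpph_ps(acc, a, b);
}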

/* VMPSADBW */
#define _mm_mask_mpsadbw_epu8(W, U, A, B, imm) \
  ((__m128i)__builtin_ia32_selectw_128( \
      (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
      (__v8hi)(__m128i)(W)))

#define _mm_maskz_mpsadbw_epu8(U, A, B, imm) \
  ((__m128i)__builtin_ia32_selectw_128( \
      (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
      (__v8hi)_mm_setzero_si128()))

#define _mm256_mask_mpsadbw_epu8(W, U, A, B, imm) \
  ((__m256i)__builtin_ia32_selectw_256( \
      (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
      (__v16hi)(__m256i)(W)))

#define _mm256_maskz_mpsadbw_epu8(U, A, B, imm) \
  ((__m256i)__builtin_ia32_selectw_256( \
      (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
      (__v16hi)_mm256_setzero_si256()))
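These wrappers just layer a write-mask onto the existing MPSADBW intrinsics; a hedged sketch of the masked form (helper name and the immediate 0 are illustrative):

/* Sketch only: compute the eight 4-byte SAD sums, keep the lanes
   selected by `keep`, and take the remaining lanes from `fallback`. */
#include <immintrin.h>

static __m128i masked_sad(__m128i fallback, __mmask8 keep, __m128i a,
                          __m128i b) {
  return _mm_mask_mpsadbw_epu8(fallback, keep, a, b, 0);
}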

/* VNNI INT8 */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbssd_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbssd_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbssd_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbssd_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbssds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbssds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbssds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbssds_epi32(
    __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbsud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbsud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbsud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbsud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbsuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbsuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbsuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbsuds_epi32(
    __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbuud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbuud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbuud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbuud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbuuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbuuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbuuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbuuds_epi32(
    __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}
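A hedged sketch of how the INT8 VNNI family reads in practice (helper name illustrative): the signed-signed form accumulates four byte products per 32-bit lane.

/* Sketch only: each dword lane i becomes
   acc[i] + sum over k in 0..3 of a[4i+k] * b[4i+k], signed bytes. */
#include <immintrin.h>

static __m128i s8_dot_step(__m128i acc, __m128i a, __m128i b) {
  return _mm_dpbssd_epi32(acc, a, b);
}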

/* VNNI INT16 */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwsud_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwsud_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwsuds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwsuds_epi32(
    __mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwusd_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwusd_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwusds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwusds_epi32(
    __mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwuud_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwuud_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwuuds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwuuds_epi32(
    __mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C),
      (__v8si)_mm256_setzero_si256());
}

#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128

#endif /* __AVX10_2NIINTRIN_H */
#endif /* __SSE2__ */
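The word-granularity VNNI forms follow the same pattern with two 16-bit products per lane; a hedged sketch (helper name illustrative):

/* Sketch only: each dword lane i becomes
   acc[i] + a[2i]*b[2i] + a[2i+1]*b[2i+1], with signed words from `a`
   and unsigned words from `b` in the "su" flavor. */
#include <immintrin.h>

static __m256i s16u16_dot_step(__m256i acc, __m256i a, __m256i b) {
  return _mm256_dpwsud_epi32(acc, a, b);
}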
Some files were not shown because too many files have changed in this diff.