// -*- C++ -*-
//===------------------------------ bit ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//

#ifndef _LIBCUDACXX_BIT
#define _LIBCUDACXX_BIT

/*
    bit synopsis

namespace std {

  template <class T>
    constexpr bool has_single_bit(T x) noexcept; // C++20
  template <class T>
    constexpr T bit_ceil(T x);                   // C++20
  template <class T>
    constexpr T bit_floor(T x) noexcept;         // C++20
  template <class T>
    constexpr T bit_width(T x) noexcept;         // C++20

  // 23.20.2, rotating
  template<class T>
    constexpr T rotl(T x, unsigned int s) noexcept; // C++20
  template<class T>
    constexpr T rotr(T x, unsigned int s) noexcept; // C++20

  // 23.20.3, counting
  template<class T>
    constexpr int countl_zero(T x) noexcept;  // C++20
  template<class T>
    constexpr int countl_one(T x) noexcept;   // C++20
  template<class T>
    constexpr int countr_zero(T x) noexcept;  // C++20
  template<class T>
    constexpr int countr_one(T x) noexcept;   // C++20
  template<class T>
    constexpr int popcount(T x) noexcept;     // C++20

  // 20.15.9, endian
  enum class endian {
    little = see below,        // C++20
    big = see below,           // C++20
    native = see below         // C++20
};

} // namespace std

*/

#ifndef __cuda_std__
#include <__config>
#include <__debug>
#endif // __cuda_std__

#include "__assert" // all public C++ headers provide the assertion handler
#include "cstdint"
#include "limits"
#include "type_traits"
#include "version"

#ifndef __cuda_std__
#include <__pragma_push>
#endif //__cuda_std__

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#if defined(_LIBCUDACXX_COMPILER_MSVC)
#include <intrin.h>
#endif // _LIBCUDACXX_COMPILER_MSVC

#if defined(_LIBCUDACXX_COMPILER_IBM)
#include "support/ibm/support.h"
#endif // _LIBCUDACXX_COMPILER_IBM

_LIBCUDACXX_BEGIN_NAMESPACE_STD

#define _LIBCUDACXX_BIT_CONSTEXPR constexpr

// Last step of the constexpr count-trailing-zeros binary search.
// __c is the count accumulated by the wider steps; one more is added when
// bit 0 of __x is clear.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __binary_ctz2(uint64_t __x, int __c) noexcept
{
  return ((__x & 0x1) != 0) ? __c : (__c + 1);
}

// 4-bit step of the count-trailing-zeros search: when the low two bits of __x
// are both clear they are shifted out and 2 is added to the running count __c
// before handing over to __binary_ctz2.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __binary_ctz4(uint64_t __x, int __c) noexcept
{
  return ((__x & 0x3) != 0)
           ? __binary_ctz2(__x, __c)
           : __binary_ctz2(__x >> 2, __c + 2);
}

// 8-bit step of the count-trailing-zeros search: when the low nibble of __x is
// zero it is shifted out and 4 is added to the running count __c before
// handing over to __binary_ctz4.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __binary_ctz8(uint64_t __x, int __c) noexcept
{
  return ((__x & 0x0F) != 0)
           ? __binary_ctz4(__x, __c)
           : __binary_ctz4(__x >> 4, __c + 4);
}

// 16-bit step of the count-trailing-zeros search: when the low byte of __x is
// zero it is shifted out and 8 is added to the running count __c before
// handing over to __binary_ctz8.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __binary_ctz16(uint64_t __x, int __c) noexcept
{
  return ((__x & 0x00FF) != 0)
           ? __binary_ctz8(__x, __c)
           : __binary_ctz8(__x >> 8, __c + 8);
}

// 32-bit step of the count-trailing-zeros search: when the low 16 bits of __x
// are zero they are shifted out and 16 is added to the running count __c
// before handing over to __binary_ctz16.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __binary_ctz32(uint64_t __x, int __c) noexcept
{
  return ((__x & 0x0000FFFF) != 0)
           ? __binary_ctz16(__x, __c)
           : __binary_ctz16(__x >> 16, __c + 16);
}

// Entry point of the constexpr count-trailing-zeros search for 64-bit values.
// Starts the running count at 0, or at 32 (after dropping the low word) when
// the low 32 bits of __x are all zero, then continues in __binary_ctz32.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __binary_ctz64(uint64_t __x) noexcept
{
  return ((__x & 0x00000000FFFFFFFF) != 0)
           ? __binary_ctz32(__x, 0)
           : __binary_ctz32(__x >> 32, 32);
}

// Last step of the constexpr count-leading-zeros binary search.
// Produces __c with its lowest bit toggled when bit 1 of __x is clear. The
// wider steps in this file only ever accumulate even values into __c, so for
// them the XOR is the same as adding one.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __binary_clz2(uint64_t __x, int __c)
{
  return __c ^ (((__x & 0x2) == 0) ? 1 : 0);
}

// 4-bit step of the count-leading-zeros search: if either of bits 2..3 is set
// the value is shifted down by 2 so those bits are examined next; otherwise 2
// is added to the running count __c. Continues in __binary_clz2.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __binary_clz4(uint64_t __x, int __c)
{
  return ((__x & 0xC) != 0)
           ? __binary_clz2(__x >> 2, __c)
           : __binary_clz2(__x, __c + 2);
}

// 8-bit step of the count-leading-zeros search: if the high nibble (bits 4..7)
// is non-zero the value is shifted down by 4; otherwise 4 is added to the
// running count __c. Continues in __binary_clz4.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __binary_clz8(uint64_t __x, int __c)
{
  return ((__x & 0xF0) != 0)
           ? __binary_clz4(__x >> 4, __c)
           : __binary_clz4(__x, __c + 4);
}

// 16-bit step of the count-leading-zeros search: if the high byte (bits 8..15)
// is non-zero the value is shifted down by 8; otherwise 8 is added to the
// running count __c. Continues in __binary_clz8.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __binary_clz16(uint64_t __x, int __c)
{
  return ((__x & 0xFF00) != 0)
           ? __binary_clz8(__x >> 8, __c)
           : __binary_clz8(__x, __c + 8);
}

// 32-bit step of the count-leading-zeros search: if bits 16..31 are non-zero
// the value is shifted down by 16; otherwise 16 is added to the running count
// __c. Continues in __binary_clz16.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __binary_clz32(uint64_t __x, int __c)
{
  return ((__x & 0xFFFF0000) != 0)
           ? __binary_clz16(__x >> 16, __c)
           : __binary_clz16(__x, __c + 16);
}

// Entry point of the constexpr count-leading-zeros search for 64-bit values.
// If the high 32 bits are non-zero they are shifted down and the count starts
// at 0; otherwise the count starts at 32. Continues in __binary_clz32.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __binary_clz64(uint64_t __x)
{
  return ((__x & 0xFFFFFFFF00000000) != 0)
           ? __binary_clz32(__x >> 32, 0)
           : __binary_clz32(__x, 32);
}

// Final stage of the constexpr population-count fallback: multiplies __x by a
// constant with the low bit of every byte set and keeps the top byte of the
// product. __fallback_popc16 calls this with every byte masked to its low
// nibble.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __fallback_popc8(uint64_t __x)
{
  return static_cast<int>(
      (__x * 0x0101010101010101) >> 56);
}

// Third stage of the population-count fallback: adds each nibble of __x to its
// upper neighbour, keeps the low nibble of every byte, and forwards the result
// to __fallback_popc8.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __fallback_popc16(uint64_t __x)
{
  return __fallback_popc8(((__x >> 4) + __x) & 0x0f0f0f0f0f0f0f0f);
}

// Second stage of the population-count fallback: adds adjacent 2-bit fields of
// __x into 4-bit fields and forwards the result to __fallback_popc16.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __fallback_popc32(uint64_t __x)
{
  return __fallback_popc16(
      ((__x >> 2) & 0x3333333333333333) + (__x & 0x3333333333333333));
}

// Entry point of the constexpr population-count fallback for a 64-bit value.
// Subtracts the odd-position bits (moved down by one) from __x, which leaves a
// per-pair bit count in every 2-bit field, then continues in
// __fallback_popc32.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __fallback_popc64(uint64_t __x)
{
  return __fallback_popc32(__x - (0x5555555555555555 & (__x >> 1)));
}

#ifndef _LIBCUDACXX_COMPILER_MSVC

// Counts the trailing zero bits of an `unsigned` value.
//
// With NVRTC or nvcc older than 11.3:
//   * if __libcpp_is_constant_evaluated() is available (C++14 and up) and the
//     call is not being constant-evaluated, device code uses __ffs (returning
//     sizeof(unsigned) * 8 for a zero input) and host code uses __builtin_ctz;
//   * otherwise the constexpr search __binary_ctz32 is used.
// Every other compiler calls __builtin_ctz directly.
inline _LIBCUDACXX_INLINE_VISIBILITY constexpr
int __libcpp_ctz(unsigned __x) noexcept {
#if defined(_LIBCUDACXX_COMPILER_NVRTC) \
 || (defined(_LIBCUDACXX_CUDACC_BELOW_11_3))
#if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_LIBCUDACXX_STD_VER >= 14)
  if (!__libcpp_is_constant_evaluated()) {
    NV_IF_ELSE_TARGET(NV_IS_DEVICE, (
      return (!__x) ? sizeof(unsigned) * 8 : __ffs(__x) - 1;
    ), (
      return __builtin_ctz(__x);
    ))
  }
#endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_LIBCUDACXX_STD_VER >= 14)

    return __binary_ctz32(static_cast<uint64_t>(__x), 0);
#else // ^^^ _LIBCUDACXX_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_LIBCUDACXX_COMPILER_NVRTC && nvcc >= 11.3 vvv
    return __builtin_ctz(__x);
#endif // !_LIBCUDACXX_COMPILER_NVRTC && nvcc >= 11.3
}

// Counts the trailing zero bits of an `unsigned long` value.
//
// With NVRTC or nvcc older than 11.3:
//   * if __libcpp_is_constant_evaluated() is available (C++14 and up) and the
//     call is not being constant-evaluated, device code uses __ffsll
//     (returning sizeof(unsigned long) * 8 for a zero input) and host code
//     uses __builtin_ctzl;
//   * otherwise the value is widened to uint64_t and handed to the constexpr
//     search __binary_ctz64.
// Every other compiler calls __builtin_ctzl directly.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __libcpp_ctz(unsigned long __x) noexcept {
#if defined(_LIBCUDACXX_COMPILER_NVRTC) \
 || (defined(_LIBCUDACXX_CUDACC_BELOW_11_3))
#if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_LIBCUDACXX_STD_VER >= 14)
  if (!__libcpp_is_constant_evaluated()) {
    NV_IF_ELSE_TARGET(NV_IS_DEVICE, (
      return (!__x) ? sizeof(unsigned long) * 8 : __ffsll(__x) - 1;
    ), (
      return __builtin_ctzl(__x);
    ))
  }
#endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_LIBCUDACXX_STD_VER >= 14)

    return __binary_ctz64(static_cast<uint64_t>(__x));
#else // ^^^ _LIBCUDACXX_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_LIBCUDACXX_COMPILER_NVRTC && nvcc >= 11.3 vvv
    return __builtin_ctzl(__x);
#endif // !_LIBCUDACXX_COMPILER_NVRTC && nvcc >= 11.3
}

// Counts the trailing zero bits of an `unsigned long long` value.
//
// Unlike the narrower overloads, the intrinsic-or-fallback branch is forced on
// for every compiler by the `#if 1` below:
//   * if __libcpp_is_constant_evaluated() is available (C++14 and up) and the
//     call is not being constant-evaluated, device code uses __ffsll
//     (returning sizeof(unsigned long long) * 8 for a zero input) and host
//     code uses __builtin_ctzll;
//   * otherwise the constexpr search __binary_ctz64 is used.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __libcpp_ctz(unsigned long long __x) noexcept {
// The unconditional __builtin_ctzll branch (the `#else` below) is disabled:
// per the original author it does not compile here although it should.
#if 1 // was: #ifdef _LIBCUDACXX_COMPILER_NVRTC
#if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_LIBCUDACXX_STD_VER >= 14)
  if (!__libcpp_is_constant_evaluated()) {
    NV_IF_ELSE_TARGET(NV_IS_DEVICE, (
      return (!__x) ? sizeof(unsigned long long) * 8 : __ffsll(__x) - 1;
    ), (
      return __builtin_ctzll(__x);
    ))
  }
#endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_LIBCUDACXX_STD_VER >= 14)

    return __binary_ctz64(static_cast<uint64_t>(__x));
#else // 0 (currently dead code)
    return __builtin_ctzll(__x);
#endif // 1
}

// Counts the leading zero bits of an `unsigned` value.
//
// With NVRTC or nvcc older than 11.3:
//   * if __libcpp_is_constant_evaluated() is available (C++14 and up) and the
//     call is not being constant-evaluated, device code uses __clz and host
//     code uses __builtin_clz;
//   * otherwise the constexpr search __binary_clz32 is used.
// Every other compiler calls __builtin_clz directly.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __libcpp_clz(unsigned __x)           noexcept {
#if defined(_LIBCUDACXX_COMPILER_NVRTC) \
 || (defined(_LIBCUDACXX_CUDACC_BELOW_11_3))
#if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_LIBCUDACXX_STD_VER >= 14)
  if (!__libcpp_is_constant_evaluated()) {
    NV_IF_ELSE_TARGET(NV_IS_DEVICE, (
      return __clz(__x);
    ), (
      return __builtin_clz(__x);
    ))
  }
#endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_LIBCUDACXX_STD_VER >= 14)
  return __binary_clz32(static_cast<uint64_t>(__x), 0);
#else // ^^^ _LIBCUDACXX_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_LIBCUDACXX_COMPILER_NVRTC && nvcc >= 11.3 vvv
  return __builtin_clz(__x);
#endif // !_LIBCUDACXX_COMPILER_NVRTC && nvcc >= 11.3
}

// Counts the leading zero bits of an `unsigned long` value.
//
// With NVRTC or nvcc older than 11.3:
//   * if __libcpp_is_constant_evaluated() is available (C++14 and up) and the
//     call is not being constant-evaluated, device code uses __clzll and host
//     code uses __builtin_clzl;
//   * otherwise the value is widened to uint64_t and handed to the constexpr
//     search __binary_clz64.
// Every other compiler calls __builtin_clzl directly.
//
// NOTE(review): the __clzll and __binary_clz64 paths count within 64 bits, so
// they presumably rely on `unsigned long` being 64 bits wide in this
// (non-MSVC) configuration -- confirm.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __libcpp_clz(unsigned long __x)      noexcept {
#if defined(_LIBCUDACXX_COMPILER_NVRTC) \
 || (defined(_LIBCUDACXX_CUDACC_BELOW_11_3))
#if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_LIBCUDACXX_STD_VER >= 14)
  if (!__libcpp_is_constant_evaluated()) {
    NV_IF_ELSE_TARGET(NV_IS_DEVICE, (
      return __clzll(__x);
    ), (
      return __builtin_clzl(__x);
    ))
  }
#endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_LIBCUDACXX_STD_VER >= 14)

  return __binary_clz64(static_cast<uint64_t>(__x));
#else // ^^^ _LIBCUDACXX_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_LIBCUDACXX_COMPILER_NVRTC && nvcc >= 11.3 vvv
  return __builtin_clzl(__x);
#endif // !_LIBCUDACXX_COMPILER_NVRTC && nvcc >= 11.3
}

// Counts the leading zero bits of an `unsigned long long` value.
//
// With NVRTC or nvcc older than 11.3:
//   * if __libcpp_is_constant_evaluated() is available (C++14 and up) and the
//     call is not being constant-evaluated, device code uses __clzll and host
//     code uses __builtin_clzll;
//   * otherwise the constexpr search __binary_clz64 is used.
// Every other compiler calls __builtin_clzll directly.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __libcpp_clz(unsigned long long __x) noexcept {
#if defined(_LIBCUDACXX_COMPILER_NVRTC) \
 || (defined(_LIBCUDACXX_CUDACC_BELOW_11_3))
#if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_LIBCUDACXX_STD_VER >= 14)
  if (!__libcpp_is_constant_evaluated()) {
    NV_IF_ELSE_TARGET(NV_IS_DEVICE, (
      return __clzll(__x);
    ), (
      return __builtin_clzll(__x);
    ))
  }
#endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_LIBCUDACXX_STD_VER >= 14)

  return __binary_clz64(static_cast<uint64_t>(__x));
#else // ^^^ _LIBCUDACXX_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_LIBCUDACXX_COMPILER_NVRTC && nvcc >= 11.3 vvv
  return __builtin_clzll(__x);
#endif // !_LIBCUDACXX_COMPILER_NVRTC && nvcc >= 11.3
}

// Counts the set bits of an `unsigned` value.
//
// With NVRTC or nvcc older than 11.3:
//   * if __libcpp_is_constant_evaluated() is available (C++14 and up) and the
//     call is not being constant-evaluated, device code uses __popc and host
//     code uses __builtin_popcount;
//   * otherwise the value is widened to uint64_t and handed to the constexpr
//     fallback __fallback_popc64.
// Every other compiler calls __builtin_popcount directly.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __libcpp_popcount(unsigned __x)           noexcept {
#if defined(_LIBCUDACXX_COMPILER_NVRTC) \
 || (defined(_LIBCUDACXX_CUDACC_BELOW_11_3))
#if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_LIBCUDACXX_STD_VER >= 14)
  if (!__libcpp_is_constant_evaluated()) {
    NV_IF_ELSE_TARGET(NV_IS_DEVICE, (
      return __popc(__x);
    ), (
      return __builtin_popcount(__x);
    ))
  }
#endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_LIBCUDACXX_STD_VER >= 14)

  return __fallback_popc64(static_cast<uint64_t>(__x));
#else // ^^^ _LIBCUDACXX_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_LIBCUDACXX_COMPILER_NVRTC && nvcc >= 11.3 vvv
  return __builtin_popcount(__x);
#endif // !_LIBCUDACXX_COMPILER_NVRTC && nvcc >= 11.3
}

// Counts the set bits of an `unsigned long` value.
//
// With NVRTC or nvcc older than 11.3:
//   * if __libcpp_is_constant_evaluated() is available (C++14 and up) and the
//     call is not being constant-evaluated, device code uses __popcll and host
//     code uses __builtin_popcountl;
//   * otherwise the value is widened to uint64_t and handed to the constexpr
//     fallback __fallback_popc64.
// Every other compiler calls __builtin_popcountl directly.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __libcpp_popcount(unsigned long __x)      noexcept {
#if defined(_LIBCUDACXX_COMPILER_NVRTC) \
 || (defined(_LIBCUDACXX_CUDACC_BELOW_11_3))
#if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_LIBCUDACXX_STD_VER >= 14)
  if (!__libcpp_is_constant_evaluated()) {
    NV_IF_ELSE_TARGET(NV_IS_DEVICE, (
    return __popcll(__x);
    ), (
      return __builtin_popcountl(__x);
    ))
  }
#endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_LIBCUDACXX_STD_VER >= 14)

  return __fallback_popc64(static_cast<uint64_t>(__x));
#else // ^^^ _LIBCUDACXX_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_LIBCUDACXX_COMPILER_NVRTC && nvcc >= 11.3 vvv
  return __builtin_popcountl(__x);
#endif // !_LIBCUDACXX_COMPILER_NVRTC && nvcc >= 11.3
}

// Counts the set bits of an `unsigned long long` value.
//
// With NVRTC or nvcc older than 11.3:
//   * if __libcpp_is_constant_evaluated() is available (C++14 and up) and the
//     call is not being constant-evaluated, device code uses __popcll and host
//     code uses __builtin_popcountll;
//   * otherwise the constexpr fallback __fallback_popc64 is used.
// Every other compiler calls __builtin_popcountll directly.
inline _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __libcpp_popcount(unsigned long long __x) noexcept {
#if defined(_LIBCUDACXX_COMPILER_NVRTC) \
 || (defined(_LIBCUDACXX_CUDACC_BELOW_11_3))
#if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_LIBCUDACXX_STD_VER >= 14)
  if (!__libcpp_is_constant_evaluated()) {
    NV_IF_ELSE_TARGET(NV_IS_DEVICE, (
      return __popcll(__x);
    ), (
      return __builtin_popcountll(__x);
    ))
  }
#endif // defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && (_LIBCUDACXX_STD_VER >= 14)

  return __fallback_popc64(static_cast<uint64_t>(__x));
#else // ^^^ _LIBCUDACXX_COMPILER_NVRTC || nvcc < 11.3 ^^^ / vvv !_LIBCUDACXX_COMPILER_NVRTC && nvcc >= 11.3 vvv
  return __builtin_popcountll(__x);
#endif // !_LIBCUDACXX_COMPILER_NVRTC && nvcc >= 11.3
}

#else  // _LIBCUDACXX_COMPILER_MSVC

// MSVC: counts the trailing zero bits of an `unsigned` value.
// Precondition:  __x != 0 (the runtime path nevertheless returns 32 for zero).
//
// When __libcpp_is_constant_evaluated() is available, __CUDA_ARCH__ is not
// defined, and the call is not being constant-evaluated, _BitScanForward is
// used. In every other case the constexpr search __binary_ctz32 is used.
inline _LIBCUDACXX_INLINE_VISIBILITY
constexpr int __libcpp_ctz(unsigned __x) {
  // _BitScanForward takes an unsigned long; both types must be 32 bits wide.
  static_assert(sizeof(unsigned) == sizeof(unsigned long), "");
  static_assert(sizeof(unsigned long) == 4, "");
#if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && !defined(__CUDA_ARCH__)
  if (!__libcpp_is_constant_evaluated()) {
    unsigned long __where = 0;
    if (_BitScanForward(&__where, __x))
      return static_cast<int>(__where);
    return 32;
  }
#endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__)

  return __binary_ctz32(static_cast<uint64_t>(__x), 0);
}

// MSVC: `unsigned long` overload of __libcpp_ctz. The static_assert pins
// `unsigned long` to the size of `unsigned`, so the call simply forwards to
// the `unsigned` overload.
inline _LIBCUDACXX_INLINE_VISIBILITY
constexpr int __libcpp_ctz(unsigned long __x) {
    static_assert(sizeof(unsigned long) == sizeof(unsigned), "");
    return __libcpp_ctz(static_cast<unsigned>(__x));
}

// MSVC: counts the trailing zero bits of an `unsigned long long` value.
//
// When __libcpp_is_constant_evaluated() is available, __CUDA_ARCH__ is not
// defined, and the call is not being constant-evaluated, the bit-scan
// intrinsics are used and 64 is returned for a zero input. In every other
// case the constexpr search __binary_ctz64 is used.
//
// NOTE(review): the 64-bit intrinsic is selected only when
// _LIBCUDACXX_HAS_BITSCAN64 is defined AND the target is x86-64, whereas the
// matching __libcpp_clz overload tests _LIBCUDACXX_HAS_BITSCAN64 alone --
// confirm whether the difference is intentional.
inline _LIBCUDACXX_INLINE_VISIBILITY
constexpr int __libcpp_ctz(unsigned long long __x) {
#if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && !defined(__CUDA_ARCH__)
  if (!__libcpp_is_constant_evaluated()) {
    unsigned long __where = 0;
#  if defined(_LIBCUDACXX_HAS_BITSCAN64) && (defined(_M_AMD64) || defined(__x86_64__))
    if (_BitScanForward64(&__where, __x))
      return static_cast<int>(__where);
#  else
    // Win32 doesn't have _BitScanForward64 so emulate it with two 32 bit calls.
    // Scan the low word first; only if it is empty scan the high word and
    // offset the result by 32.
    if (_BitScanForward(&__where, static_cast<unsigned long>(__x)))
      return static_cast<int>(__where);
    if (_BitScanForward(&__where, static_cast<unsigned long>(__x >> 32)))
      return static_cast<int>(__where + 32);
#  endif
    return 64;
  }
#endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__)

  return __binary_ctz64(__x);
}

// MSVC: counts the leading zero bits of an `unsigned` value.
// Precondition:  __x != 0 (the runtime path nevertheless returns 32 for zero).
//
// When __libcpp_is_constant_evaluated() is available, __CUDA_ARCH__ is not
// defined, and the call is not being constant-evaluated, _BitScanReverse is
// used and its bit index is converted to a leading-zero count (31 - index).
// In every other case the constexpr search __binary_clz32 is used.
inline _LIBCUDACXX_INLINE_VISIBILITY
constexpr int __libcpp_clz(unsigned __x) {
  // _BitScanReverse takes an unsigned long; both types must be 32 bits wide.
  static_assert(sizeof(unsigned) == sizeof(unsigned long), "");
  static_assert(sizeof(unsigned long) == 4, "");
#if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && !defined(__CUDA_ARCH__)
  if (!__libcpp_is_constant_evaluated()) {
    unsigned long __where = 0;
    if (_BitScanReverse(&__where, __x))
      return static_cast<int>(31 - __where);
    return 32; // Undefined Behavior.
  }
#endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__)

  return __binary_clz32(static_cast<uint64_t>(__x), 0);
}

// MSVC: `unsigned long` overload of __libcpp_clz. The static_assert pins
// `unsigned long` to the size of `unsigned`, so the call simply forwards to
// the `unsigned` overload.
inline _LIBCUDACXX_INLINE_VISIBILITY
constexpr int __libcpp_clz(unsigned long __x) {
    static_assert(sizeof(unsigned) == sizeof(unsigned long), "");
    return __libcpp_clz(static_cast<unsigned>(__x));
}

// MSVC: counts the leading zero bits of an `unsigned long long` value.
//
// When __libcpp_is_constant_evaluated() is available, __CUDA_ARCH__ is not
// defined, and the call is not being constant-evaluated, the bit-scan
// intrinsics are used and 64 is returned for a zero input. In every other
// case the constexpr search __binary_clz64 is used.
inline _LIBCUDACXX_INLINE_VISIBILITY
constexpr int __libcpp_clz(unsigned long long __x) {
#if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && !defined(__CUDA_ARCH__)
  if (!__libcpp_is_constant_evaluated()) {
    unsigned long __where = 0;
#  if defined(_LIBCUDACXX_HAS_BITSCAN64)
    if (_BitScanReverse64(&__where, __x))
      return static_cast<int>(63 - __where);
#  else
    // Win32 doesn't have _BitScanReverse64 so emulate it with two 32 bit calls.
    // Scan the high word first; only if it is empty scan the low word.
    if (_BitScanReverse(&__where, static_cast<unsigned long>(__x >> 32)))
      return static_cast<int>(63 - (__where + 32));
    if (_BitScanReverse(&__where, static_cast<unsigned long>(__x)))
      return static_cast<int>(63 - __where);
#  endif
    return 64; // Undefined Behavior.
  }
#endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__)

  return __binary_clz64(static_cast<uint64_t>(__x));
}

// MSVC: counts the set bits of an `unsigned` value.
//
// When __libcpp_is_constant_evaluated() is available, __CUDA_ARCH__ is not
// defined, and the call is not being constant-evaluated, the __popcnt
// intrinsic is used. In every other case the value is widened to uint64_t and
// handed to the constexpr fallback __fallback_popc64.
inline _LIBCUDACXX_INLINE_VISIBILITY
constexpr int __libcpp_popcount(unsigned __x) {
  static_assert(sizeof(unsigned) == 4, "");
#if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && !defined(__CUDA_ARCH__)
  if (!__libcpp_is_constant_evaluated()) {
    return static_cast<int>(__popcnt(__x));
  }
#endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__)

  return __fallback_popc64(static_cast<uint64_t>(__x));
}

// MSVC: counts the set bits of an `unsigned long` value (asserted to be
// 4 bytes wide).
//
// When __libcpp_is_constant_evaluated() is available, __CUDA_ARCH__ is not
// defined, and the call is not being constant-evaluated, the __popcnt
// intrinsic is used. In every other case the value is widened to uint64_t and
// handed to the constexpr fallback __fallback_popc64.
inline _LIBCUDACXX_INLINE_VISIBILITY
constexpr int __libcpp_popcount(unsigned long __x) {
  static_assert(sizeof(unsigned long) == 4, "");
#if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && !defined(__CUDA_ARCH__)
  if (!__libcpp_is_constant_evaluated()) {
    return static_cast<int>(__popcnt(__x));
  }
#endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__)

  return __fallback_popc64(static_cast<uint64_t>(__x));
}

// MSVC: counts the set bits of an `unsigned long long` value (asserted to be
// 8 bytes wide).
//
// When __libcpp_is_constant_evaluated() is available, __CUDA_ARCH__ is not
// defined, and the call is not being constant-evaluated, the __popcnt64
// intrinsic is used. In every other case the constexpr fallback
// __fallback_popc64 is used.
inline _LIBCUDACXX_INLINE_VISIBILITY
constexpr int __libcpp_popcount(unsigned long long __x) {
  static_assert(sizeof(unsigned long long) == 8, "");
#if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) && !defined(__CUDA_ARCH__)
  if (!__libcpp_is_constant_evaluated()) {
    return static_cast<int>(__popcnt64(__x));
  }
#endif // _LIBCUDACXX_IS_CONSTANT_EVALUATED && !defined(__CUDA_ARCH__)

  return __fallback_popc64(static_cast<uint64_t>(__x));
}

#endif // _LIBCUDACXX_COMPILER_MSVC

// Rotates __t left by __cnt bit positions. The count is reduced modulo the
// bit width of _Tp, and a reduced count of zero returns __t unchanged so that
// no shift by the full width is ever evaluated.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
_Tp __rotl(_Tp __t, unsigned int __cnt) noexcept
{
    static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__rotl requires unsigned");

    return ((__cnt % numeric_limits<_Tp>::digits) != 0)
        ? (__t << (__cnt % numeric_limits<_Tp>::digits))
            | (__t >> (numeric_limits<_Tp>::digits - (__cnt % numeric_limits<_Tp>::digits)))
        : __t;
}


// Rotates __t right by __cnt bit positions. The count is reduced modulo the
// bit width of _Tp, and a reduced count of zero returns __t unchanged so that
// no shift by the full width is ever evaluated.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
_Tp __rotr(_Tp __t, unsigned int __cnt) noexcept
{
    static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__rotr requires unsigned");

    return ((__cnt % numeric_limits<_Tp>::digits) != 0)
        ? (__t >> (__cnt % numeric_limits<_Tp>::digits))
            | (__t << (numeric_limits<_Tp>::digits - (__cnt % numeric_limits<_Tp>::digits)))
        : __t;
}

// Forward declaration of __countr_zero. __countr_zero_rsh_impl (used for types
// wider than unsigned long long) calls it on each unsigned-long-long-sized
// word before the definition further down has been seen.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __countr_zero(_Tp __t) noexcept;

// Trailing-zero count for types no larger than unsigned int: the value is
// widened to unsigned int and handed to __libcpp_ctz. __countr_zero only calls
// this for non-zero __t.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
__enable_if_t<sizeof(_Tp) <= sizeof(unsigned int), int> __countr_zero_dispatch(_Tp __t) noexcept
{
  return __libcpp_ctz(static_cast<unsigned int>(__t));
}

// Trailing-zero count for types the size of unsigned long long: forwards to
// the unsigned long long overload of __libcpp_ctz. __countr_zero only calls
// this for non-zero __t.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
__enable_if_t<sizeof(_Tp) == sizeof(unsigned long long), int> __countr_zero_dispatch(_Tp __t) noexcept
{
  return __libcpp_ctz(static_cast<unsigned long long>(__t));
}

// Trailing-zero count for types wider than unsigned long long, processed one
// unsigned-long-long-sized word at a time from the least significant end.
// _St is the number of words still to be examined.
template <typename _Tp, int _St = sizeof(_Tp)/sizeof(unsigned long long)>
struct __countr_zero_rsh_impl {
    // __cur is the count for the word just examined and __count the total of
    // the words before it. A word that is not entirely zero ends the search;
    // an all-zero word moves on to the next one (__t is already shifted).
    static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
    int __short_circuit(_Tp __t, int __cur, int __count) {
      return (__cur != numeric_limits<unsigned long long>::digits)
            ? __cur + __count
            : __countr_zero_rsh_impl<_Tp, _St-1>::__count(__t, __cur + __count);
    }

    // Counts the low word of __t, then lets __short_circuit decide whether
    // the remaining (right-shifted) words need to be looked at.
    static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
    int __count(_Tp __t, int __count) {
        return __short_circuit(
            __t >> numeric_limits<unsigned long long>::digits,
            __countr_zero(static_cast<unsigned long long>(__t)),
            __count);
    }
};

// Recursion end for __countr_zero_rsh_impl: only one word is left, so its
// trailing-zero count is added to the running total __count.
template <typename _Tp>
struct __countr_zero_rsh_impl<_Tp, 1> {
    static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
    int __count(_Tp __t, int __count) {
        return __countr_zero(static_cast<unsigned long long>(__t)) + __count;
    }
};

// Trailing-zero count for types wider than unsigned long long: delegates to
// the word-by-word __countr_zero_rsh_impl, starting with a running count of 0.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
__enable_if_t<(sizeof(_Tp) > sizeof(unsigned long long)), int> __countr_zero_dispatch(_Tp __t) noexcept
{ return __countr_zero_rsh_impl<_Tp>::__count(__t, 0); }

// Number of consecutive zero bits in __t starting at the least significant
// bit. A zero input yields the bit width of _Tp without consulting the
// size-specific dispatchers.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __countr_zero(_Tp __t) noexcept
{
    static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countr_zero requires unsigned");

    return (__t == 0)
        ? numeric_limits<_Tp>::digits
        : __countr_zero_dispatch(__t);
}

// Forward declaration of __countl_zero. __countl_zero_rotl_impl (used for
// types wider than unsigned long long) calls it on each
// unsigned-long-long-sized word before the definition further down has been
// seen.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __countl_zero(_Tp __t) noexcept;

// Leading-zero count for types no larger than unsigned int. The value is
// counted as an unsigned int and the result is corrected by the difference in
// bit width between unsigned int and _Tp.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
__enable_if_t<sizeof(_Tp) <= sizeof(unsigned int), int> __countl_zero_dispatch(_Tp __t) noexcept
{
  return __libcpp_clz(static_cast<unsigned int>(__t))
              + numeric_limits<_Tp>::digits
              - numeric_limits<unsigned int>::digits;
}

// Leading-zero count for types the size of unsigned long long. The value is
// counted as an unsigned long long and the result is corrected by the
// difference in bit width between unsigned long long and _Tp.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
__enable_if_t<sizeof(_Tp) == sizeof(unsigned long long), int> __countl_zero_dispatch(_Tp __t) noexcept
{
  return __libcpp_clz(static_cast<unsigned long long>(__t))
              + numeric_limits<_Tp>::digits
              - numeric_limits<unsigned long long>::digits;
}

// Leading-zero count for types wider than unsigned long long. Each step
// rotates the value left by one unsigned-long-long-sized word, which brings
// the next most significant word into the low position where it can be
// counted. _St is the number of words still to be examined.
template <typename _Tp, int _St = sizeof(_Tp)/sizeof(unsigned long long)>
struct __countl_zero_rotl_impl {
    // __cur is the count for the word just examined. A word that is not
    // entirely zero ends the search; otherwise the count of the remaining
    // words is added on top.
    static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
    int __short_circuit(_Tp __t, int __cur) {
      return (__cur != numeric_limits<unsigned long long>::digits)
            ? __cur
            : __cur + __countl_zero_rotl_impl<_Tp, _St-1>::__count(__t);
    }

    // Counts the low word of the already-rotated value and passes the result
    // on to __short_circuit.
    static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
    int __countl_iter(_Tp __t) {
      return __short_circuit(
            __t,
            __countl_zero(static_cast<unsigned long long>(__t)));
    }

    // Rotates the next word into the low position and examines it.
    static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
    int __count(_Tp __t) {
        return __countl_iter(
            __rotl(__t, numeric_limits<unsigned long long>::digits));
    }
};

// Recursion end for __countl_zero_rotl_impl: rotates the last remaining word
// into the low position and returns its leading-zero count.
template <typename _Tp>
struct __countl_zero_rotl_impl<_Tp, 1> {
    static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
    int __count(_Tp __t) {
        return __countl_zero(
            static_cast<unsigned long long>(
                __rotl(__t, numeric_limits<unsigned long long>::digits)));
    }
};

// Leading-zero count for types wider than unsigned long long: delegates to
// the word-by-word __countl_zero_rotl_impl.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
__enable_if_t<(sizeof(_Tp) > sizeof(unsigned long long)), int> __countl_zero_dispatch(_Tp __t) noexcept
{ return __countl_zero_rotl_impl<_Tp>::__count(__t); }

// Number of consecutive zero bits in __t starting at the most significant
// bit. A zero input yields the bit width of _Tp without consulting the
// size-specific dispatchers.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __countl_zero(_Tp __t) noexcept
{
    static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countl_zero requires unsigned");

    return (__t == 0)
        ? numeric_limits<_Tp>::digits
        : __countl_zero_dispatch(__t);
}

// Number of consecutive one bits in __t starting at the most significant bit,
// computed as the leading-zero count of the complement. An all-ones input
// yields the bit width of _Tp directly.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __countl_one(_Tp __t) noexcept
{
    static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countl_one requires unsigned");

    return (__t == numeric_limits<_Tp>::max())
        ? numeric_limits<_Tp>::digits
        : __countl_zero(static_cast<_Tp>(~__t));
}


// Number of consecutive one bits in __t starting at the least significant
// bit, computed as the trailing-zero count of the complement. An all-ones
// input yields the bit width of _Tp directly.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __countr_one(_Tp __t) noexcept
{
    static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countr_one requires unsigned");

    return (__t == numeric_limits<_Tp>::max())
        ? numeric_limits<_Tp>::digits
        : __countr_zero(static_cast<_Tp>(~__t));
}

// Population count for types no larger than unsigned int: the value is
// widened to unsigned int and handed to __libcpp_popcount.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
__enable_if_t<sizeof(_Tp) <= sizeof(unsigned int), int> __popcount_dispatch(_Tp __t) noexcept
{ return __libcpp_popcount(static_cast<unsigned int>(__t)); }

// Population count for types the size of unsigned long long: forwards to the
// unsigned long long overload of __libcpp_popcount.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
__enable_if_t<sizeof(_Tp) == sizeof(unsigned long long), int> __popcount_dispatch(_Tp __t) noexcept
{ return __libcpp_popcount(static_cast<unsigned long long>(__t)); }

// Population count for types wider than unsigned long long: counts the low
// unsigned-long-long-sized word and adds the count of the remaining words,
// obtained by shifting them down. _St is the number of words still to be
// examined.
template <typename _Tp, int _St = sizeof(_Tp)/sizeof(unsigned long long)>
struct __popcount_rsh_impl {
    static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
    int __count(_Tp __t) {
        return __libcpp_popcount(static_cast<unsigned long long>(__t)) +
               __popcount_rsh_impl<_Tp, _St-1>::__count(
                    __t >> numeric_limits<unsigned long long>::digits);
    }
};

// Recursion end for __popcount_rsh_impl: only one word is left, so its
// population count is the result.
template <typename _Tp>
struct __popcount_rsh_impl<_Tp, 1> {
    static _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
    int __count(_Tp __t) {
        return __libcpp_popcount(static_cast<unsigned long long>(__t));
    }
};

// Population count for types wider than unsigned long long: delegates to the
// word-by-word __popcount_rsh_impl.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
__enable_if_t<(sizeof(_Tp) > sizeof(unsigned long long)), int> __popcount_dispatch(_Tp __t) noexcept
{ return __popcount_rsh_impl<_Tp>::__count(__t); }

// Number of set bits in __t. Rejects non-unsigned types at compile time and
// picks the size-appropriate __popcount_dispatch overload.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
int __popcount(_Tp __t) noexcept
{
    static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__libcpp_popcount requires unsigned");

    return __popcount_dispatch(__t);
}


// integral log base 2
//
// Returns the index of the most significant set bit of __t, computed as
// (bit width of _Tp) - 1 - __countl_zero(__t).
//
// For __t == 0, __countl_zero returns the full bit width, so the int
// expression is -1 and converts to the largest unsigned value; bit_floor and
// bit_width test for zero before calling this.
//
// numeric_limits is named unqualified, like everywhere else in this header,
// so that it is looked up in the namespace this header is compiled into.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
unsigned __bit_log2(_Tp __t) noexcept
{
    static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__bit_log2 requires unsigned");
    return numeric_limits<_Tp>::digits - 1 - __countl_zero(__t);
}

// True when exactly one bit of __t is set. Clearing the lowest set bit with
// __t & (__t - 1) leaves zero only for such values; zero itself is excluded
// by the first test.
template <class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
bool __has_single_bit(_Tp __t) noexcept
{
    static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__has_single_bit requires unsigned");
    return (__t != 0) && !(__t & (__t - 1));
}

// Smallest power of two that is not less than __t, for types at least as
// large as unsigned. The result is 1 shifted left by the number of bits needed
// to represent __t - 1. bit_ceil only calls this for __t >= 2.
// NOTE(review): no check is made that the result fits in _Tp; the assertion
// that used to guard this is disabled (it required the shift amount to differ
// from the bit width of _Tp).
template <class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
__enable_if_t<sizeof(_Tp) >= sizeof(unsigned), _Tp>
__ceil2(_Tp __t) noexcept
{
    return _Tp{1} << (numeric_limits<_Tp>::digits
                      - __countl_zero(static_cast<_Tp>(__t - 1u)));
}

// Smallest power of two that is not less than __t, for types smaller than
// unsigned. With
//   n     = digits(_Tp) - __countl_zero(__t - 1)   (bits needed for __t - 1)
//   extra = digits(unsigned) - digits(_Tp)
// the result is (1u << (n + extra)) >> extra, converted to _Tp. bit_ceil only
// calls this for __t >= 2.
// NOTE(review): the detour through the top bits of an unsigned presumably
// exists so that an unrepresentable result becomes an oversized shift instead
// of silently wrapping -- confirm against the upstream libc++ rationale.
template <class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
__enable_if_t<sizeof(_Tp) < sizeof(unsigned), _Tp>
__ceil2(_Tp __t) noexcept
{
    return static_cast<_Tp>(
        (1u << ((numeric_limits<_Tp>::digits - __countl_zero(static_cast<_Tp>(__t - 1u)))
                + (numeric_limits<unsigned>::digits - numeric_limits<_Tp>::digits)))
        >> (numeric_limits<unsigned>::digits - numeric_limits<_Tp>::digits));
}


#if (_LIBCUDACXX_STD_VER > 17) || defined(__cuda_std__)

// rotl
// Public rotate-left. Participates in overload resolution only for types
// accepted by __libcpp_is_unsigned_integer and forwards to __rotl.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
__enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp>
rotl(_Tp __t, unsigned int __cnt) noexcept
{
    return __rotl(__t, __cnt);
}


// rotr
// Public rotate-right. Participates in overload resolution only for types
// accepted by __libcpp_is_unsigned_integer and forwards to __rotr.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
__enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp>
rotr(_Tp __t, unsigned int __cnt) noexcept
{
    return __rotr(__t, __cnt);
}


// countl_zero
// Public count of consecutive zero bits from the most significant end.
// Participates in overload resolution only for types accepted by
// __libcpp_is_unsigned_integer and forwards to __countl_zero.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
__enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int>
countl_zero(_Tp __t) noexcept
{
    return __countl_zero(__t);
}


// countl_one
// Public count of consecutive one bits from the most significant end.
// Participates in overload resolution only for types accepted by
// __libcpp_is_unsigned_integer and forwards to __countl_one.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
__enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int>
countl_one(_Tp __t) noexcept
{
    return __countl_one(__t);
}


// countr_zero
// Public count of consecutive zero bits from the least significant end.
// Participates in overload resolution only for types accepted by
// __libcpp_is_unsigned_integer and forwards to __countr_zero.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
__enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int>
countr_zero(_Tp __t) noexcept
{
	return __countr_zero(__t);
}


// countr_one
// Public count of consecutive one bits from the least significant end.
// Participates in overload resolution only for types accepted by
// __libcpp_is_unsigned_integer and forwards to __countr_one.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
__enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int>
countr_one(_Tp __t) noexcept
{
    return __countr_one(__t);
}


// popcount
// Public count of set bits. Participates in overload resolution only for
// types accepted by __libcpp_is_unsigned_integer and forwards to __popcount.
template<class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
__enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, int>
popcount(_Tp __t) noexcept
{
    return __popcount(__t);
}


// has_single_bit
// Public test for "exactly one bit set". Participates in overload resolution
// only for types accepted by __libcpp_is_unsigned_integer and forwards to
// __has_single_bit.
template <class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY constexpr
__enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, bool>
has_single_bit(_Tp __t) noexcept
{
    return __has_single_bit(__t);
}

// bit_floor
// Largest power of two that is not greater than __t, or 0 when __t is 0.
// Built by shifting 1 up to the position of the most significant set bit
// (__bit_log2).
template <class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
__enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp>
bit_floor(_Tp __t) noexcept
{
    return (__t != 0)
        ? static_cast<_Tp>(_Tp{1} << __bit_log2(__t))
        : 0;
}

// bit_ceil
// Smallest power of two that is not less than __t. Inputs 0 and 1 both give
// 1; larger inputs are handled by __ceil2.
template <class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
__enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp>
bit_ceil(_Tp __t) noexcept
{
    return (__t >= 2)
        ? static_cast<_Tp>(__ceil2(__t))
        : 1;
}

// bit_width
// Number of bits needed to represent __t: 0 for a zero input, otherwise one
// more than the index of the most significant set bit (__bit_log2).
// NOTE(review): the result is returned as _Tp, matching the synopsis at the
// top of this header; the published C++20 <bit> interface returns int --
// confirm before changing, since that would alter the public signature.
template <class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_BIT_CONSTEXPR
__enable_if_t<__libcpp_is_unsigned_integer<_Tp>::value, _Tp>
bit_width(_Tp __t) noexcept
{
    return (__t != 0)
        ? static_cast<_Tp>(__bit_log2(__t) + 1)
        : 0;
}


// endian
// Byte-order indicator. `little` and `big` are two distinct tag values;
// `native` equals whichever of them the configuration macros select, or a
// third distinct value when neither _LIBCUDACXX_LITTLE_ENDIAN nor
// _LIBCUDACXX_BIG_ENDIAN is defined.
enum class endian
{
    little = 0xDEAD,
    big    = 0xFACE,
#if defined(_LIBCUDACXX_LITTLE_ENDIAN)
    native = little
#elif defined(_LIBCUDACXX_BIG_ENDIAN)
    native = big
#else
    native = 0xCAFE
#endif
};

#endif // (_LIBCUDACXX_STD_VER > 17) || defined(__cuda_std__)

_LIBCUDACXX_END_NAMESPACE_STD

#ifndef __cuda_std__
#include <__pragma_pop>
#endif // __cuda_std__

#endif // _LIBCUDACXX_BIT
