CF438D 题解

FurippuWRY · 2023-11-05 17:02:04 · 题解

鲜花：~~看这美丽的 4 秒时限，它在告诉我们这题用暴力做。~~

前置知识：巴雷特模乘、循环展开。

巴雷特模乘就是比一般的取模更快，用公式表示就是：

a \bmod b=a-(\dfrac{ay}{2^{64}})\times b

其中 y=\dfrac{2^{64}}{b}。

由于 unsigned long long 的最大值为 2^{64}-1，所以要用 __int128。

注意 2^{64} 在代码里要用 1 << 64 来表示。

用代码表示为：

long long mod(long long a, long long b) {
    __int128 y = ((__int128)1 << 64) / b;
    return a - ((__int128)a * y >> 64) * b;
}

用这种方法能减少取模所需的时间，在这题里能减少上百毫秒。

如果模数大于被模数，就不用巴雷特了。

循环展开是一种牺牲程序的尺寸来加快程序的执行速度的优化方法，把循环次数减少，然后把多个操作放在一次循环里。举个例子，比如我们要求区间 [1,10^9] 中的所有数的和，一般写法如下：

long long ans = 0, MAX = 1e9;
for (long long i = 1; i <= MAX; ++i) {
    ans += i;
}

循环展开的写法如下：

long long ans = 0, MAX = 1e18;
for (long long i = 1; i <= MAX; i += 5) {
    ans += i;
    ans += (i + 1);
    ans += (i + 2);
    ans += (i + 3);
    ans += (i + 4);
}

经过数次运行测试，一般写法的代码用时在 [1.905,2.145] 秒之间，循环展开的代码用时在 [1.28,1.289] 秒之间，可见循环展开的优化之大。

总之做法就是，在第二个操作里加上巴雷特模乘和循环展开，然后打个快读快写和 Ofast，别的就是纯暴力了。

注意变量不要打太多，会拉时间。

#include <bits/stdc++.h>
#pragma GCC optimize("Ofast")

using namespace std;

typedef long long ll;
typedef __int128 i128;

const ll N = 1e5 + 9;

//快读快写
char buf[1 << 21], *p1, *p2;
#define getchar() (p1==p2&&(p2=(p1=buf)+fread(buf,1,1<<21,stdin),p1==p2)?EOF:*p1++)
inline ll read() {ll x = 0, f = 1;char ch = getchar();while (ch < '0' || ch > '9') {if (ch == '-')f = -1;ch = getchar();}while (ch >= '0' && ch <= '9') x = (x << 3) + (x << 1) + ch - '0', ch = getchar();return x * f;}
inline void print(ll x, char ch = 0) {if (x < 0) putchar('-'), print(-x);else {if (x >= 10) print(x / 10);putchar(x % 10 + '0');}if (ch) putchar(ch);return;}

int n, m, l, r, x, opt;
ll a[N], ans;

int main() {

    n = read();
    m = read();
    for (ll i = 1; i <= n; ++i) {
        a[i] = read();
    }
    while (m--) {
        ans = 0;
        opt = read();
        l = read();
        r = read();
        if (opt == 1) {
            for (ll i = l; i <= r; ++i) {
                ans += a[i];
            }
            print(ans, '\n');
        }
        else
        if (opt == 2) {
            x = read();
            i128 y = ((i128)1 << 64) / x;
            ll i;
         //循环展开
            for (i = l; i + 4 <= r; i += 5) {
                //巴雷特模乘
                if (a[i] >= x) {
                    a[i] = a[i] - ((i128)a[i] * y >> 64) * x;
                    if (a[i] >= x) a[i] -= x;
                }
                if (a[i + 1] >= x)
                {
                    a[i + 1] = a[i + 1] - ((i128) a[i + 1] * y >> 64) * x;
                    if (a[i + 1] >= x) a[i + 1] -= x;
                }
                if (a[i + 2] >= x)
                {
                    a[i + 2] = a[i + 2] - ((i128) a[i + 2] * y >> 64) * x;
                    if (a[i + 2] >= x) a[i + 2] -= x;
                }
                if (a[i + 3] >= x)
                {
                    a[i + 3] = a[i + 3] - ((i128) a[i + 3] * y >> 64) * x;
                    if (a[i + 3] >= x) a[i + 3] -= x;
                }
                if (a[i + 4] >= x)
                {
                    a[i + 4] = a[i + 4] - ((i128) a[i + 4] * y >> 64) * x;
                    if (a[i + 4] >= x) a[i + 4] -= x;
                }
            }
            while (i <= r) {
                if (a[i] >= x) {
                    a[i] = a[i] - ((i128)a[i] * y >> 64) * x;
                    if (a[i] >= x) a[i] -= x;
                }
                ++i;
            }
        }
        else {
            a[l] = r;
        }
    }
    return 0;
}

此段代码用时 3432ms，完全暴力代码超时。

感谢 @double_wings 的指导。