| # | Time | Username | Problem | Language | Result | Execution time | Memory |
|---|---|---|---|---|---|---|---|
| 1303012 | liangjeremy | Cubeword (CEOI19_cubeword) | C++20 | 0 ms | 0 KiB |
#pragma GCC optimize("O3,unroll-loops")
#pragma GCC target("avx2,bmi,bmi2,lzcnt,popcnt")
#include<bits/stdc++.h>
using namespace std;
// Keep the macro for your logic, but we will use int32_t for performance critical parts
#define int long long
// Standard definitions
using ll=int64_t;
// Optimized Modular Integer
// Removed internal long long storage to save cache bandwidth
template<int mod>
struct mint {
int32_t v; // Use 32-bit integer for storage
mint():v(0){}
mint(ll _v):v((int32_t)(_v%mod)){ if(v<0) v+=mod; }
mint& operator+=(const mint& o){
if((v+=o.v)>=mod)v-=mod;
return *this;
}
mint& operator-=(const mint& o){
if((v-=o.v)<0)v+=mod;
return *this;
}
mint& operator*=(const mint& o){
v = (int32_t)((ll)v*o.v%mod); // Cast to long long only for multiplication
return *this;
}
friend mint operator+(mint a, const mint& b){ return a+=b; }
friend mint operator*(mint a, const mint& b){ return a*=b; }
};
using mi = mint<998244353>;
// GLOBAL STATIC ARRAYS
// Replacing vectors with fixed size arrays for contiguous memory
// 205 is sufficient based on your code (vis is size 200)
const int MAXT = 205;
mi dp1[11][MAXT][MAXT];
mi dp2[MAXT][MAXT][MAXT];
int32_t main(){
ios::sync_with_stdio(0); cin.tie(0); cout.tie(0);
int n;
if(!(cin >> n)) return 0;
vector<string> s(2*n);
vector<int> vis(256, 0); // Increased slightly for safety
for(int i=0; i<n; i++){
cin >> s[i];
s[i+n] = s[i];
reverse(s[i+n].begin(), s[i+n].end());
for(auto x:s[i]){ vis[x]=1; }
}
sort(s.begin(), s.end());
s.erase(unique(s.begin(), s.end()), s.end());
n = s.size();
int timer = 0;
vector<int> pos(256);
for(int i=0; i<256; i++){
if(vis[i]){
timer++; pos[i]=timer;
}
}
// Fill dp1
for(int i=0; i<n; i++){
if(s[i].size() <= 10) {
dp1[s[i].size()][pos[s[i][0]]][pos[s[i].back()]] += 1;
}
}
mi ans = 0;
// Main DP Logic
for(int len=3; len<=10; len++){
// Reset dp2 for this length
// Since it's global, we must manually clear it or overwrite it
// Note: Using int32_t for loops is faster
for(int32_t i=1; i<=timer; i++){
for(int32_t j=1; j<=timer; j++){
for(int32_t k=1; k<=timer; k++){
mi val = 0;
// Inner loop logic optimized:
// dp1 access is linear over x for the 2nd and 3rd term if we organized it differently,
// but static array is fast enough here.
for(int32_t x=1; x<=timer; x++){
val += dp1[len][x][i] * dp1[len][x][j] * dp1[len][x][k];
}
dp2[i][j][k] = val;
}
}
}
// CACHE OPTIMIZATION: Moved 'x' to the OUTERMOST loop
// Original was i,j,k,x. Accessing dp2[x][...] inside inner loop jumps huge memory addresses.
// With x outer, dp2[x][i][k] becomes constant for the inner loops over j.
for(int32_t x=1; x<=timer; x++){
for(int32_t i=1; i<=timer; i++){
for(int32_t k=1; k<=timer; k++){
// Precompute/fetch constant parts for the inner j loop
mi term_xk = dp2[x][i][k];
for(int32_t j=1; j<=timer; j++){
// dp2[i][j][k] is standard access
// dp2[x][i][k] is constant in this loop
// dp2[x][i][j] is linear access
// dp2[x][j][k] is scattered, but better than before
ans += dp2[i][j][k] * term_xk * dp2[x][i][j] * dp2[x][j][k];
}
}
}
}
}
cout << (int)ans.v << '\n';
}
