Suffix Array and LCP Array
For an array \(A\) of length \(n\), sa_lcp(A)
returns two vectors \(SA\) and \(LCP\) where,
- \(A[SA[i] \dots]\) is the \(i\)-th suffix in lexicographical order for every \(i \in \left[0, n\right)\)
and
- \(LCP[i]\) is the length of the longest common prefix between \(A[SA[i-1] \dots]\) and \(A[SA[i] \dots]\) for every \(i \in \left[1, n \right)\). Also, \(LCP[0] = 0\).
\(SA\) and \(LCP\) are called "suffix array" and "LCP array" of \(A\) respectively.
For finding SA, Manber-Myers algorithm combined with counting sort is used, hence the time complexity is \(O(n\log{n})\). For LCP array, Kasai's algorithm is used, hence the time complexity is \(O(n)\). The total time complexity is \(O(n\log{n})\).
Example
fn main() { let s = "asdsdasd"; let (sa, lcp) = sa_lcp(s.as_bytes()); println!("{:?}", sa); // [5, 0, 7, 4, 2, 6, 3, 1] println!("{:?}", lcp); // [x, 3, 0, 1, 1, 0, 2, 2] } // Suffix array and LCP array // Reference: http://www.secmem.org/blog/2021/07/18/suffix-array-and-lcp/ fn suffix_array<T: Ord>(s: &[T]) -> Vec<usize> { use std::collections::*; if s.len() == 0 { return vec![]; } else if s.len() == 1 { return vec![0]; } let n = s.len(); let mut r: Vec<usize> = vec![0; n * 2]; let map: BTreeMap<_, _> = { let mut sorted: Vec<_> = s.iter().collect(); sorted.sort_unstable(); sorted .into_iter() .enumerate() .map(|x| (x.1, x.0 + 1)) .collect() }; for i in 0..n { r[i] = *map.get(&s[i]).unwrap(); } let m = n.max(map.len()) + 1; let mut sa: Vec<usize> = (0..n).collect(); let mut nr: Vec<usize> = vec![0; n * 2]; let mut cnt: Vec<usize> = vec![0; m]; let mut idx: Vec<usize> = vec![0; n]; for d in (0..).map(|x| 1 << x).take_while(|&d| d < n) { macro_rules! key { ($i:expr) => { if $i + d >= n { (r[$i], 0) } else { (r[$i], r[$i + d]) } }; } (0..m).for_each(|i| cnt[i] = 0); (0..n).for_each(|i| cnt[r[i + d]] += 1); (1..m).for_each(|i| cnt[i] += cnt[i - 1]); for i in (0..n).rev() { cnt[r[i + d]] -= 1; idx[cnt[r[i + d]]] = i; } (0..m).for_each(|i| cnt[i] = 0); (0..n).for_each(|i| cnt[r[i]] += 1); (1..m).for_each(|i| cnt[i] += cnt[i - 1]); for i in (0..n).rev() { cnt[r[idx[i]]] -= 1; sa[cnt[r[idx[i]]]] = idx[i]; } nr[sa[0]] = 1; for i in 1..n { nr[sa[i]] = nr[sa[i - 1]] + if key!(sa[i - 1]) < key!(sa[i]) { 1 } else { 0 }; } std::mem::swap(&mut r, &mut nr); if r[sa[n - 1]] == n { break; } } sa } fn sa_lcp<T: Ord>(arr: &[T]) -> (Vec<usize>, Vec<usize>) { let n = arr.len(); let sa = suffix_array(arr); let mut lcp: Vec<usize> = vec![0; n]; let mut isa: Vec<usize> = vec![0; n]; for i in 0..n { isa[sa[i]] = i; } let mut k = 0; for i in 0..n { if isa[i] != 0 { let j = sa[isa[i] - 1]; while i + k < n && j + k < n && arr[i + k] == arr[j + k] { k += 1; } lcp[isa[i]] = if k != 0 { k -= 1; k + 1 } else { 0 }; } } (sa, lcp) }
Code
// Suffix array and LCP array
// Reference: http://www.secmem.org/blog/2021/07/18/suffix-array-and-lcp/
fn suffix_array<T: Ord>(s: &[T]) -> Vec<usize> {
use std::collections::*;
if s.len() == 0 {
return vec![];
} else if s.len() == 1 {
return vec![0];
}
let n = s.len();
let mut r: Vec<usize> = vec![0; n * 2];
let map: BTreeMap<_, _> = {
let mut sorted: Vec<_> = s.iter().collect();
sorted.sort_unstable();
sorted
.into_iter()
.enumerate()
.map(|x| (x.1, x.0 + 1))
.collect()
};
for i in 0..n {
r[i] = *map.get(&s[i]).unwrap();
}
let m = n.max(map.len()) + 1;
let mut sa: Vec<usize> = (0..n).collect();
let mut nr: Vec<usize> = vec![0; n * 2];
let mut cnt: Vec<usize> = vec![0; m];
let mut idx: Vec<usize> = vec![0; n];
for d in (0..).map(|x| 1 << x).take_while(|&d| d < n) {
macro_rules! key {
($i:expr) => {
if $i + d >= n {
(r[$i], 0)
} else {
(r[$i], r[$i + d])
}
};
}
(0..m).for_each(|i| cnt[i] = 0);
(0..n).for_each(|i| cnt[r[i + d]] += 1);
(1..m).for_each(|i| cnt[i] += cnt[i - 1]);
for i in (0..n).rev() {
cnt[r[i + d]] -= 1;
idx[cnt[r[i + d]]] = i;
}
(0..m).for_each(|i| cnt[i] = 0);
(0..n).for_each(|i| cnt[r[i]] += 1);
(1..m).for_each(|i| cnt[i] += cnt[i - 1]);
for i in (0..n).rev() {
cnt[r[idx[i]]] -= 1;
sa[cnt[r[idx[i]]]] = idx[i];
}
nr[sa[0]] = 1;
for i in 1..n {
nr[sa[i]] = nr[sa[i - 1]] + if key!(sa[i - 1]) < key!(sa[i]) { 1 } else { 0 };
}
std::mem::swap(&mut r, &mut nr);
if r[sa[n - 1]] == n {
break;
}
}
sa
}
fn sa_lcp<T: Ord>(arr: &[T]) -> (Vec<usize>, Vec<usize>) {
let n = arr.len();
let sa = suffix_array(arr);
let mut lcp: Vec<usize> = vec![0; n];
let mut isa: Vec<usize> = vec![0; n];
for i in 0..n {
isa[sa[i]] = i;
}
let mut k = 0;
for i in 0..n {
if isa[i] != 0 {
let j = sa[isa[i] - 1];
while i + k < n && j + k < n && arr[i + k] == arr[j + k] {
k += 1;
}
lcp[isa[i]] = if k != 0 {
k -= 1;
k + 1
} else {
0
};
}
}
(sa, lcp)
}