For over 3 days I'm trying to solve Shortest Superstring problem and I still can't figure out how can I create resulting string when I know in what order should I use given words and how these words overlaps. I have read many solutions but I really don't understand them indeed.
That's my code:
#include <iostream>
#include <vector>
using namespace std;
struct Node
{
int cost;
int next_mask;
int next_node;
Node(int cost, int mask, int next_node) : cost(cost), next_mask(mask), next_node(next_node)
{
}
Node() : cost(-1), next_mask(-1), next_node(-1)
{
}
bool is_done() const {
return cost != -1;
}
};
// Cost = how many extra characters do I need to fully overlap w2. The characters from w1 are not extra
pair<int, string> get_cost(string const& w1, string const& w2) {
for (int i = 0; i < w1.size(); ++i) {
string sub = w1.substr(i);
int overlaps_start = w2.find(sub);
if (overlaps_start != 0)
continue;
int overlapping_count = overlaps_start + sub.size();
return { w2.size() - overlapping_count, sub };
}
return { w2.size(), "" };
}
void solve(vector<vector<int>> const& graph, int current, int mask, vector<vector<Node>>& dp) {
if (mask == (1 << graph.size()) - 1 || dp[mask][current].is_done())
return;
for (int node = 0; node < graph.size(); ++node) {
int check_state_mask = 1 << node;
if (mask & check_state_mask)
continue;
int node_mask = mask | check_state_mask;
solve(graph, node, node_mask, dp);
int sub = graph[current][node] + dp[node_mask][node].cost;
if (!dp[mask][current].is_done() || sub < dp[mask][current].cost) {
dp[mask][current].cost = sub;
dp[mask][current].next_mask = node_mask;
dp[mask][current].next_node = node;
}
}
}
string shortestSuperstring(vector<string>& words) {
vector<vector<int>> graph(words.size(), vector<int>(words.size()));
vector<vector<string>> common_parts(words.size(), vector<string>(words.size()));
for (int i = 0; i < words.size(); ++i) {
for (int j = 0; j < words.size(); ++j) {
pair<int, string> cost1 = get_cost(words[i], words[j]);
pair<int, string> cost2 = get_cost(words[j], words[i]);
graph[i][j] = cost1.first;
graph[j][i] = cost2.first;
common_parts[i][j] = cost1.second;
common_parts[j][i] = cost2.second;
}
}
// dp also tracks the path
vector<vector<Node>> dp(1 << graph.size(), vector<Node>(graph.size(), Node()));
solve(graph, 0, 1, dp);
// We start from node 0 with mask = 1, so the result is at dp[1][0]
Node parent = dp[1][0];
int parent_node = 0;
while (parent.next_node != -1) {
// What should happen here to build a result???
cout << parent_node << " : " << parent.next_node << " Common: " << common_parts[parent_node][parent.next_node] << "\n";
parent_node = parent.next_node;
parent = dp[parent.next_mask][parent.next_node];
}
return "im-too-weak-to-solve-it";
}
int main()
{
vector<string> v{ "catg","ctaagt","gcta","ttca","atgcatc" };
std::cout << shortestSuperstring(v);
}
If you run this code, you will see that for some nodes they may not be common string, and I think that's my problem. When I know where two strings overlaps, I can do some substrings and get the result. But when some strings don't have common part, then I don't know where to insert them, and I can't omit this one because the next one will depend on it and so on.
Theoretically, I could find the best position for it (most overlapping) by looking at current result, but how can I be sure that it won't destroy the result? If I put it on the end or on the beginning, then how can I be sure that there is no better solution?
So how can I construct the result if I know the word's order and how they overlaps?