Constructing result for given word's order and overlapping in Shortest Superstring Problem

106 Views Asked by At

For over 3 days I'm trying to solve Shortest Superstring problem and I still can't figure out how can I create resulting string when I know in what order should I use given words and how these words overlaps. I have read many solutions but I really don't understand them indeed.

That's my code:

#include <iostream>
#include <vector>

using namespace std;

struct Node
{
    int cost;
    int next_mask;
    int next_node;

    Node(int cost, int mask, int next_node) : cost(cost), next_mask(mask), next_node(next_node)
    {
    }

    Node() : cost(-1), next_mask(-1), next_node(-1)
    {
    }

    bool is_done() const {
        return cost != -1;
    }
};

// Cost = how many extra characters do I need to fully overlap w2. The characters from w1 are not extra
pair<int, string> get_cost(string const& w1, string const& w2) {
    for (int i = 0; i < w1.size(); ++i) {
        string sub = w1.substr(i);
        int overlaps_start = w2.find(sub);
        if (overlaps_start != 0)
            continue;

        int overlapping_count = overlaps_start + sub.size();
        return { w2.size() - overlapping_count, sub };
    }

    return { w2.size(), "" };
}

void solve(vector<vector<int>> const& graph, int current, int mask, vector<vector<Node>>& dp) {
    if (mask == (1 << graph.size()) - 1 || dp[mask][current].is_done())
        return;

    for (int node = 0; node < graph.size(); ++node) {
        int check_state_mask = 1 << node;
        if (mask & check_state_mask)
            continue;

        int node_mask = mask | check_state_mask;
        solve(graph, node, node_mask, dp);
        int sub = graph[current][node] + dp[node_mask][node].cost;

        if (!dp[mask][current].is_done() || sub < dp[mask][current].cost) {
            dp[mask][current].cost = sub;
            dp[mask][current].next_mask = node_mask;
            dp[mask][current].next_node = node;
        }
    }
}

string shortestSuperstring(vector<string>& words) {
    vector<vector<int>> graph(words.size(), vector<int>(words.size()));
    vector<vector<string>> common_parts(words.size(), vector<string>(words.size()));
    for (int i = 0; i < words.size(); ++i) {
        for (int j = 0; j < words.size(); ++j) {
            pair<int, string> cost1 = get_cost(words[i], words[j]);
            pair<int, string> cost2 = get_cost(words[j], words[i]);

            graph[i][j] = cost1.first;
            graph[j][i] = cost2.first;

            common_parts[i][j] = cost1.second;
            common_parts[j][i] = cost2.second;
        }
    }

    // dp also tracks the path
    vector<vector<Node>> dp(1 << graph.size(), vector<Node>(graph.size(), Node()));
    solve(graph, 0, 1, dp);

    // We start from node 0 with mask = 1, so the result is at dp[1][0]
    Node parent = dp[1][0];
    int parent_node = 0;
    while (parent.next_node != -1) {
        // What should happen here to build a result???
        cout << parent_node << " : " << parent.next_node << " Common: " << common_parts[parent_node][parent.next_node] << "\n";

        parent_node = parent.next_node;
        parent = dp[parent.next_mask][parent.next_node];
    }

    return "im-too-weak-to-solve-it";
}

int main()
{
    vector<string> v{ "catg","ctaagt","gcta","ttca","atgcatc" };
    std::cout << shortestSuperstring(v);
}

If you run this code, you will see that for some nodes they may not be common string, and I think that's my problem. When I know where two strings overlaps, I can do some substrings and get the result. But when some strings don't have common part, then I don't know where to insert them, and I can't omit this one because the next one will depend on it and so on.

Theoretically, I could find the best position for it (most overlapping) by looking at current result, but how can I be sure that it won't destroy the result? If I put it on the end or on the beginning, then how can I be sure that there is no better solution?

So how can I construct the result if I know the word's order and how they overlaps?

0

There are 0 best solutions below