Sunday, November 13, 2016

Minimum Genetic Mutation -- LeetCode

[Question]
A gene string can be represented by an 8-character long string, with choices from "A""C""G""T".
Suppose we need to investigate about a mutation (mutation from "start" to "end"), where ONE mutation is defined as ONE single character changed in the gene string.
For example, "AACCGGTT" -> "AACCGGTA" is 1 mutation.
Also, there is a given gene "bank", which records all the valid gene mutations. A gene must be in the bank to make it a valid gene string.
Now, given 3 things - start, end, bank, your task is to determine what is the minimum number of mutations needed to mutate from "start" to "end". If there is no such a mutation, return -1.
Note:
  1. Starting point is assumed to be valid, so it might not be included in the bank.
  2. If multiple mutations are needed, all mutations during in the sequence must be valid.
  3. You may assume start and end string is not the same.
[Analysis]
The gene strings can construct a graph with each node is an gene and each edge is a valid mutation. Then the problem becomes find a path from start node to the end node. This can be done with BFS and time complexity is O(N).

The construction of the graph needs to compare every two nodes, which makes the time complexity O(N^2). Since the gene string is an 8-character string with only 4 letters, instead of constructing a graph, we can use all valid mutation of a given string when probing the next possible gene. There only 31(4x8-1) possibilities. Therefore, the overall time complexity can be still O(N).

The sample code #1 constructed a graph and code #2 just enumerated the 31 possible mutations.

[Solution]
//-- code #1 with building a graph --
class Solution {
    bool oneMutation( const string& a, const string& b ) {
        if (a.size()!=8 && a.size()!=b.size()) return false;
     
        int count=0;
        for (int i=0; i<8; i++) {
            count += (a[i]!=b[i]);
        }
        return (count==1);
    }
 
public:
    int minMutation(string start, string end, vector<string>& bank) {
        int s=bank.size(),e=-1;
     
        for (int i=0; i<bank.size(); i++) {
            if (bank[i].compare(start)==0) s=i;
            if (bank[i].compare(end)==0) e=i;
        }
        if (e==-1) return -1;
        if (s==bank.size()) bank.push_back(start);
     
        vector<vector<int>> grph(bank.size(), vector<int>() );
        for (int i=0; i<bank.size(); i++) {
            for (int j=i+1; j<bank.size(); j++) {
                if (oneMutation(bank[i], bank[j]) ) {
                    grph[i].push_back(j);
                    grph[j].push_back(i);
                }
            }
        }
     
        unordered_set<int> visited;
        queue<int> que; int step=1;
        que.push(s);  que.push(INT_MAX);
        while (!que.empty()) {
            int cur = que.front();
            que.pop();
         
            if (cur==INT_MAX) {
                step++;
                if (que.empty()) break;
                else {
                    que.push(INT_MAX);
                    continue;
                }
            }
            visited.insert(cur);
            for(auto n: grph[cur]) {
                if (n==e) return step;
                if (visited.count(n)==0) que.push(n);
            }
        }
        return -1;
    }

};

//-- Code #2  without graph, using enumeration --
class Solution {
    int to_int(string gene) {
        static unordered_map<char, int> gmap ({{'A',0},{'C',1},{'G',2}, {'T',3}});
        int res=0;
        for(int i=0; i<8; i++) {
            res = res<<2 | gmap[ gene[i] ];
        }
        return res;
    }
 
public:
    int minMutation(string start, string end, vector<string>& bank) {
        unordered_set<int> gbank;
        for (int i=0; i<bank.size(); i++) {
            gbank.insert(to_int(bank[i]));
        }
        if ( gbank.count(to_int(end)) == 0 ) return -1;
     
        queue<int> que;
        que.push(to_int(start));
        que.push(INT_MAX);
        int step=1;
        int e=to_int(end);
        while (!que.empty()) {
            int cur = que.front();
            que.pop();

            if (cur==INT_MAX && que.empty()) break;
            if (cur==INT_MAX) {
                step++;
                que.push(INT_MAX);
                continue;
            }
         
            for (int i=0; i<8; i++) {
                for (int j=0; j<4; j++) {
                    int next = cur  ^ (j << 2*i);
                    if (gbank.count( next ) != 0) {
                        if (next==e) return step;
                        que.push( next );
                        gbank.erase( next );
                    }
                }
            }
        }
        return -1;
    }
};

No comments:

Post a Comment