1: class Solution {
2: public:
3: vector<string> findRepeatedDnaSequences(string s) {
4: unordered_map<string, int> map;
5: vector<string> res;
6: if (s.size() < 10) return res;
7: for (int i = 0; i <= s.size()-10; i++) {
8: string ss = s.substr(i, 10);
9: if (map.find(ss) == map.end()) map[ss] = 1;
10: else if (map[ss] == 1) {
11: res.push_back(ss);
12: map[ss]++;
13: };
14: }
15: return res;
16: }
17: };
This can pass but the performance is not very good (beat 31.4%). Let's look at the hex number of ASCII code for 'A' (0x41), 'C' (0x43), 'G' (0x47) and 'T' (0x54). It's easy to see the last 3 bits for these four letters are different (subtract letters by multiple of 16), where A is 001, C is 010, G is 111 and T is 110. Considering the string length is only 10 letters long, so the hash key can be represented by an integer which has 32 bits in total.
1: class Solution { 2: public: 3: vector<string> findRepeatedDnaSequences(string s) { 4: unordered_map<int, int> map; 5: vector<string> res; 6: if (s.size() < 10) return res; 7: int key = 0, i = 0; 8: while (
i < 9
) { 9:
key = key << 3 | s[i++] & 0x7;
10: } 11: while (i < s.size()) { 12:
key = key << 3 & 0x3FFFFFFF | s[i++] & 0x7;
13: if (map[key]++ == 1) { 14: res.push_back(s.substr(i-10, 10)); 15: } 16: } 17: return res; 18: } 19: };
No comments:
Post a Comment