function ans = p_trie(s,t,p)
%P_TRIE
% -Finds the contiguous subsequence match count between strings s and t
% by using a depth-first traversal on a retrieval trie,
% where the length of the subsequence is p.
% *(While the programs p_spectrum, p_spectrum_bf, p_spectrum_fast and
% and p_trie all do the same thing, p_trie is significantly faster than
% the other three programs.)
%
% -The following algorithm is used:
% A root node for a retrieval trie is initially made, which stores all k-mers of s,t
% of length p. Each leaf of the tree is a substring of the root k-mer. A leaf at
% depth i corresponds to all substrings which match up to position i, where 1 <= i <= p.
% The algorithm recursively goes through all leafs, so for example, if p = 2, it would go
% to i=1, and then every character in the alphabet, so initially a, and then recurse
% to i = 2, which leads to aa -> ab -> ... -> az. It would then finish this level of
% recursion and go back to i = 1, proceed to b, and then to i = 2, going to ba -> bb -> ... etc.,
% all the way to bz. At each maximum depth (i.e. p-value), the substrings present in the leaf
% are calculated and added to the current kernel count. Keep in mind that the whole tree is never
% stored in memory, just the leaves currently needed at the time.
% More explicit pseudo-code of this algorithm is given in the help file for the function traverse().
%
%
% -Example: p_trie('abccc','abc', 3) returns a value of 1.
% (Note that p_trie('abccc','abc',3)=p_trie('abc','abccc',3) since K(s,t,p) = K(t,s,p) ).
% -Example: p_trie('a','a', 1) returns a value of 1.
% -Example: p_trie('a','b', 1) returns a value of 0.
% -Example: p_trie('ab','ab', 2) returns a value of 1.
%
%
%USAGE: scalar = p_trie('string1','string2', p); (where p is the length of the substring)
%
%For more information, visit http://www.kernel-methods.net/
%Written and tested in Matlab 6.0, Release 12.
%Copyright 2003, Manju M. Pai 5/2003
%manju@kernel-methods.net
%------------------------------------------------------------------------------------------
%Obtain lengths of strings
[num_rows_s, n] = size(s);
[num_rows_t, m] = size(t);
%Error checking statements:
%Make sure input vectors are horizontal.
if (num_rows_s ~= 1 | num_rows_t ~= 1)
error('Error: s and t must be horizontal vectors.');
end;
%If p is less than zero or not a number, program should quit due to faulty variable input.
if p <= 0 | ischar(p)
error('Error: p needs to be a number greater than 0.');
end;
%If p is greater than either string, answer is intuitively zero
if p > n | p > m
ans = 0;
return;
end;
%End of error checking
%Find all p-mers of s, puts them in s_array
j = 1;
for i=1:(n-p+1)
s_array{j} = s(i:(i+p-1));
j = j + 1;
end;
%Find all p-mers of t, puts them in t_array
j = 1;
for i=1:(m-p+1)
t_array{j} = t(i:(i+p-1));
j = j + 1;
end;
%initialize variable for final answer
k_count = 0;
ans = traverse(s_array, t_array, 0, p, k_count);
return;
%----------------------------------------------------------------------------------------------
function k_count = traverse(s_array, t_array, depth, max_depth, k_count)
%TRAVERSE
% -Function called my trie(). Type 'help trie' for more information.
% The following is the pseudo-code used by this program:
%
% traverse(currentInstances, depth) {
% if depth == MAX_DEPTH { // i.e. depth == p
% update kernel with contribution from currentInstances;
% } //if
% else {
% for each symbol in alphabet {
% for each inst in currentInstances {
% if inst[depth] == symbol
% store inst in new instances;
% } //for
% traverse(newInstances, depth + 1)
% } //for
% } //else
% } //traverse()
%Written and tested in Matlab 6.0, Release 12.
%Copyright 2003, Manju M. Pai 5/2003
%manjupai@hotmail.com
%----------------------------------------------------------------------------------------------
%Obtain lengths of both arrays
n = length(s_array);
m = length(t_array);
if depth == max_depth % i.e. depth == k
k_count = k_count + (n * m); % update kernel
return;
else % need to traverse down one leaf
depth = depth + 1;
%for i='a':'z' % It's assumed that the string language is a-z,A-Z
for i = 32:126 % numeric value of all ascii characters
% The following four lines initialize the new leaves and
% also clear them for later iterations of the 'for' loop.
% They are initialized with ''...it's just a random character that
% is not part of the language.
clear new_s_leaf;
clear new_t_leaf;
new_s_leaf{1} = '';
new_t_leaf{1} = '';
% These two 'for' loops iterate through every substring in
%the leaves, making sure the letter represented by i matches the
%letter at the current depth. If so, it is passed to the new leaf.
k = 1;
for j=1:n
if (strcmp(s_array{j}(depth),char(i)))
new_s_leaf{k} = s_array{j};
k = k + 1;
end;
end;
l = 1;
for j=1:m
if (strcmp(t_array{j}(depth),char(i)))
new_t_leaf{l} = t_array{j};
l = l + 1;
end;
end;
% In the event that the position in the array with '' was never overwritten,
% it takes out that element now.
if find(strcmpi(new_s_leaf,''))
z = find(strcmpi(new_s_leaf,''));
new_s_leaf = [new_s_leaf(1:z-1), new_s_leaf(z+1:length(new_s_leaf))];
end;
if find(strcmpi(new_t_leaf,''))
z = find(strcmpi(new_t_leaf,''));
new_t_leaf = [new_t_leaf(1:z-1), new_t_leaf(z+1:length(new_t_leaf))];
end;
if(length(new_t_leaf) == 0 | length(new_s_leaf) == 0 )
result = 0; % There's no use proceeding to the next leaf if no strings are present.
else %Proceed to leaf at next depth.
k_count = traverse(new_s_leaf, new_t_leaf, depth, max_depth, k_count);
end;
end;
end;
return;