alpar@0
|
1 |
%%
|
alpar@0
|
2 |
%% Copyright 2007, 2008, 2009 Elsevier Ltd
|
alpar@0
|
3 |
%%
|
alpar@0
|
4 |
%% This file is part of the 'Elsarticle Bundle'.
|
alpar@0
|
5 |
%% ---------------------------------------------
|
alpar@0
|
6 |
%%
|
alpar@0
|
7 |
%% It may be distributed under the conditions of the LaTeX Project Public
|
alpar@0
|
8 |
%% License, either version 1.2 of this license or (at your option) any
|
alpar@0
|
9 |
%% later version. The latest version of this license is in
|
alpar@0
|
10 |
%% http://www.latex-project.org/lppl.txt
|
alpar@0
|
11 |
%% and version 1.2 or later is part of all distributions of LaTeX
|
alpar@0
|
12 |
%% version 1999/12/01 or later.
|
alpar@0
|
13 |
%%
|
alpar@0
|
14 |
%% The list of all files belonging to the 'Elsarticle Bundle' is
|
alpar@0
|
15 |
%% given in the file `manifest.txt'.
|
alpar@0
|
16 |
%%
|
alpar@0
|
17 |
|
alpar@0
|
18 |
%% Template article for Elsevier's document class `elsarticle'
|
alpar@0
|
19 |
%% with numbered style bibliographic references
|
alpar@0
|
20 |
%% SP 2008/03/01
|
alpar@0
|
21 |
|
alpar@0
|
22 |
\documentclass[preprint,12pt]{elsarticle}
|
alpar@0
|
23 |
|
alpar@0
|
24 |
%% Use the option review to obtain double line spacing
|
alpar@0
|
25 |
%% \documentclass[authoryear,preprint,review,12pt]{elsarticle}
|
alpar@0
|
26 |
|
alpar@0
|
27 |
%% Use the options 1p,twocolumn; 3p; 3p,twocolumn; 5p; or 5p,twocolumn
|
alpar@0
|
28 |
%% for a journal layout:
|
alpar@0
|
29 |
%% \documentclass[final,1p,times]{elsarticle}
|
alpar@0
|
30 |
%% \documentclass[final,1p,times,twocolumn]{elsarticle}
|
alpar@0
|
31 |
%% \documentclass[final,3p,times]{elsarticle}
|
alpar@0
|
32 |
%% \documentclass[final,3p,times,twocolumn]{elsarticle}
|
alpar@0
|
33 |
%% \documentclass[final,5p,times]{elsarticle}
|
alpar@0
|
34 |
%% \documentclass[final,5p,times,twocolumn]{elsarticle}
|
alpar@0
|
35 |
|
alpar@0
|
36 |
%% For including figures, graphicx.sty has been loaded in
|
alpar@0
|
37 |
%% elsarticle.cls. If you prefer to use the old commands
|
alpar@0
|
38 |
%% please give \usepackage{epsfig}
|
alpar@0
|
39 |
|
alpar@0
|
40 |
%% The amssymb package provides various useful mathematical symbols
|
alpar@0
|
41 |
\usepackage{amssymb}
|
alpar@0
|
42 |
%% The amsthm package provides extended theorem environments
|
alpar@0
|
43 |
%% \usepackage{amsthm}
|
alpar@0
|
44 |
|
alpar@0
|
45 |
%% The lineno packages adds line numbers. Start line numbering with
|
alpar@0
|
46 |
%% \begin{linenumbers}, end it with \end{linenumbers}. Or switch it on
|
alpar@0
|
47 |
%% for the whole article with \linenumbers.
|
alpar@0
|
48 |
%% \usepackage{lineno}
|
alpar@0
|
49 |
|
alpar@2
|
50 |
\usepackage{amsmath}
|
alpar@2
|
51 |
%% \usepackage[pdftex]{graphicx}
|
alpar@2
|
52 |
|
alpar@2
|
53 |
\usepackage{pgfplots}
|
alpar@2
|
54 |
\pgfplotsset{width=9cm}
|
alpar@2
|
55 |
\pgfplotsset{compat=1.8}
|
alpar@2
|
56 |
|
alpar@2
|
57 |
\usepackage{caption}
|
alpar@2
|
58 |
\usepackage{subcaption}
|
alpar@2
|
59 |
|
alpar@2
|
60 |
\usepackage{algorithm}
|
alpar@2
|
61 |
\usepackage{algpseudocode}
|
alpar@2
|
62 |
\usepackage{tikz}
|
alpar@2
|
63 |
|
alpar@2
|
64 |
\usepackage{amsthm,amssymb}
|
alpar@2
|
65 |
\renewcommand{\qedsymbol}{\rule{0.7em}{0.7em}}
|
alpar@2
|
66 |
|
alpar@2
|
67 |
\newtheorem{theorem}{Theorem}[subsection]
|
alpar@2
|
68 |
\newtheorem{corollary}{Corollary}[theorem]
|
alpar@2
|
69 |
\newtheorem{claim}[theorem]{Claim}
|
alpar@2
|
70 |
|
alpar@2
|
71 |
\newtheorem{definition}{Definition}[subsection]
|
alpar@2
|
72 |
\newtheorem{notation}{Notation}[subsection]
|
alpar@2
|
73 |
\newtheorem{example}{Example}[subsection]
|
alpar@2
|
74 |
\usetikzlibrary{decorations.markings}
|
alpar@2
|
75 |
\let\oldproofname=\proofname
|
alpar@2
|
76 |
%% \renewcommand{\proofname}{\rm\bf{Proof:}}
|
alpar@2
|
77 |
|
Madarasi@7
|
78 |
\captionsetup{font=normalsize}
|
Madarasi@7
|
79 |
|
alpar@1
|
80 |
\journal{Discrete Applied Mathematics}
|
alpar@0
|
81 |
|
alpar@0
|
82 |
\begin{document}
|
alpar@0
|
83 |
|
alpar@0
|
84 |
\begin{frontmatter}
|
alpar@0
|
85 |
|
alpar@0
|
86 |
%% Title, authors and addresses
|
alpar@0
|
87 |
|
alpar@0
|
88 |
%% use the tnoteref command within \title for footnotes;
|
alpar@0
|
89 |
%% use the tnotetext command for theassociated footnote;
|
alpar@0
|
90 |
%% use the fnref command within \author or \address for footnotes;
|
alpar@0
|
91 |
%% use the fntext command for theassociated footnote;
|
alpar@0
|
92 |
%% use the corref command within \author for corresponding author footnotes;
|
alpar@0
|
93 |
%% use the cortext command for theassociated footnote;
|
alpar@0
|
94 |
%% use the ead command for the email address,
|
alpar@0
|
95 |
%% and the form \ead[url] for the home page:
|
alpar@0
|
96 |
%% \title{Title\tnoteref{label1}}
|
alpar@0
|
97 |
%% \tnotetext[label1]{}
|
alpar@0
|
98 |
%% \author{Name\corref{cor1}\fnref{label2}}
|
alpar@0
|
99 |
%% \ead{email address}
|
alpar@0
|
100 |
%% \ead[url]{home page}
|
alpar@0
|
101 |
%% \fntext[label2]{}
|
alpar@0
|
102 |
%% \cortext[cor1]{}
|
alpar@0
|
103 |
%% \address{Address\fnref{label3}}
|
alpar@0
|
104 |
%% \fntext[label3]{}
|
alpar@0
|
105 |
|
alpar@24
|
106 |
\title{VF2++ --- An Improved Subgraph Isomorphism Algorithm}
|
alpar@0
|
107 |
|
alpar@0
|
108 |
%% use optional labels to link authors explicitly to addresses:
|
alpar@0
|
109 |
%% \author[label1,label2]{}
|
alpar@0
|
110 |
%% \address[label1]{}
|
alpar@0
|
111 |
%% \address[label2]{}
|
alpar@0
|
112 |
|
alpar@24
|
113 |
\author[egres,elte]{Alp{\'a}r J{\"u}ttner}
|
alpar@24
|
114 |
\ead{alpar@cs.elte.hu}
|
alpar@24
|
115 |
\author[elte]{P{\'e}ter Madarasi}
|
alpar@24
|
116 |
\ead{madarasip@caesar.elte.hu}
|
alpar@24
|
117 |
\address[egres]{MTA-ELTE Egerv{\'a}ry Research Group, Budapest, Hungary.}
|
alpar@24
|
118 |
\address[elte]{Department of Operations Research, E{\"o}tv{\"o}s Lor{\'a}nd University, Budapest, Hungary.}
|
alpar@0
|
119 |
|
alpar@0
|
120 |
\begin{abstract}
|
alpar@0
|
121 |
|
alpar@24
|
122 |
This paper presents a largely improved version of the VF2 algorithm
|
alpar@24
|
123 |
for the \emph{Subgraph Isomorphism Problem}. The improvements are
|
alpar@24
|
124 |
twofold. Firstly, it is based on a new approach for determining the
|
alpar@24
|
125 |
matching order of the nodes, and secondly, more efficient -
|
alpar@24
|
126 |
nevertheless easier to compute - cutting rules significantly
|
alpar@24
|
127 |
reducing the search space are applied.
|
alpar@1
|
128 |
|
alpar@24
|
129 |
In addition to the usual subgraph isomorphism, the paper also
|
alpar@24
|
130 |
presents specialized versions for the \emph{Induced Subgraph
|
alpar@24
|
131 |
Isomorphism} and for the \emph{Graph Isomorphism Problems}.
|
alpar@1
|
132 |
|
alpar@24
|
133 |
Finally, an extensive experimental evaluation is provided using a
|
alpar@24
|
134 |
wide range of inputs, including both real life biological and
|
alpar@24
|
135 |
chemical datasets and standard randomly generated graph series. The
|
alpar@24
|
136 |
results show major and consistent running time improvements over the
|
alpar@24
|
137 |
other known methods.
|
alpar@24
|
138 |
|
alpar@24
|
139 |
The C++ implementations of the algorithms are available open source as
|
alpar@24
|
140 |
the part of the LEMON graph and network optimization library.
|
alpar@24
|
141 |
|
alpar@0
|
142 |
\end{abstract}
|
alpar@0
|
143 |
|
alpar@0
|
144 |
\begin{keyword}
|
alpar@24
|
145 |
Computational Biology, Subgraph Isomorphism Problem
|
alpar@0
|
146 |
%% keywords here, in the form: keyword \sep keyword
|
alpar@0
|
147 |
|
alpar@0
|
148 |
%% PACS codes here, in the form: \PACS code \sep code
|
alpar@0
|
149 |
|
alpar@0
|
150 |
%% MSC codes here, in the form: \MSC code \sep code
|
alpar@0
|
151 |
%% or \MSC[2008] code \sep code (2000 is the default)
|
alpar@0
|
152 |
|
alpar@0
|
153 |
\end{keyword}
|
alpar@0
|
154 |
|
alpar@0
|
155 |
\end{frontmatter}
|
alpar@0
|
156 |
|
alpar@0
|
157 |
%% \linenumbers
|
alpar@0
|
158 |
|
alpar@0
|
159 |
%% main text
|
alpar@2
|
160 |
\section{Introduction}
|
alpar@2
|
161 |
\label{sec:intro}
|
alpar@2
|
162 |
|
alpar@3
|
163 |
In the last decades, combinatorial structures, and especially graphs
|
alpar@3
|
164 |
have been considered with ever increasing interest, and applied to the
|
alpar@3
|
165 |
solution of several new and revised questions. The expressiveness,
|
alpar@3
|
166 |
the simplicity and the studiedness of graphs make them practical for
|
alpar@3
|
167 |
modelling and appear constantly in several seemingly independent
|
Madarasi@19
|
168 |
fields, such as bioinformatics and chemistry.
|
alpar@2
|
169 |
|
alpar@3
|
170 |
Complex biological systems arise from the interaction and cooperation
|
alpar@3
|
171 |
of plenty of molecular components. Getting acquainted with such
|
Madarasi@19
|
172 |
systems at the molecular level is of primary importance, since
|
alpar@3
|
173 |
protein-protein interaction, DNA-protein interaction, metabolic
|
alpar@3
|
174 |
interaction, transcription factor binding, neuronal networks, and
|
Madarasi@19
|
175 |
hormone signaling networks can be understood this way.
|
alpar@2
|
176 |
|
Madarasi@19
|
177 |
Many chemical and biological structures can easily be modeled
|
Madarasi@19
|
178 |
as graphs, for instance, a molecular structure can be
|
Madarasi@19
|
179 |
considered as a graph, whose nodes correspond to atoms and whose
|
Madarasi@19
|
180 |
edges to chemical bonds. The similarity and dissimilarity of
|
Madarasi@19
|
181 |
objects corresponding to nodes are incorporated to the model
|
Madarasi@19
|
182 |
by \emph{node labels}. Understanding such networks basically
|
Madarasi@19
|
183 |
requires finding specific subgraphs, thus calls for efficient
|
Madarasi@19
|
184 |
graph matching algorithms.
|
alpar@2
|
185 |
|
Madarasi@19
|
186 |
Other real-world fields related to some
|
Madarasi@19
|
187 |
variants of graph matching include pattern recognition
|
alpar@3
|
188 |
and machine vision \cite{HorstBunkeApplications}, symbol recognition
|
alpar@3
|
189 |
\cite{CordellaVentoSymbolRecognition}, face identification
|
alpar@3
|
190 |
\cite{JianzhuangYongFaceIdentification}. \\
|
alpar@2
|
191 |
|
alpar@3
|
192 |
Subgraph and induced subgraph matching problems are known to be
|
alpar@3
|
193 |
NP-Complete\cite{SubgraphNPC}, while the graph isomorphism problem is
|
alpar@3
|
194 |
one of the few problems in NP neither known to be in P nor
|
alpar@3
|
195 |
NP-Complete. Although polynomial time isomorphism algorithms are known
|
alpar@3
|
196 |
for various graph classes, like trees and planar
|
alpar@3
|
197 |
graphs\cite{PlanarGraphIso}, bounded valence
|
alpar@3
|
198 |
graphs\cite{BondedDegGraphIso}, interval graphs\cite{IntervalGraphIso}
|
Madarasi@19
|
199 |
or permutation graphs\cite{PermGraphIso}, and recently, an FPT algorithm has been presented for the coloured hypergraph isomorphism problem in \cite{ColoredHiperGraphIso}.
|
alpar@2
|
200 |
|
alpar@3
|
201 |
In the following, some algorithms based on other approaches are
|
Madarasi@19
|
202 |
summarized, which do not need any restrictions on the graphs. Even though,
|
alpar@3
|
203 |
an overall polynomial behaviour is not expectable from such an
|
Madarasi@19
|
204 |
alternative, it may often have good practical performance, in fact,
|
Madarasi@19
|
205 |
it might be the best choice even on a graph class for which polynomial
|
Madarasi@19
|
206 |
algorithm is known.
|
alpar@2
|
207 |
|
alpar@3
|
208 |
The first practically usable approach was due to
|
Madarasi@19
|
209 |
\emph{Ullmann}\cite{Ullmann} which is a commonly used depth-first
|
alpar@3
|
210 |
search based algorithm with a complex heuristic for reducing the
|
alpar@3
|
211 |
number of visited states. A major problem is its $\Theta(n^3)$ space
|
alpar@3
|
212 |
complexity, which makes it impractical in the case of big sparse
|
alpar@3
|
213 |
graphs.
|
alpar@2
|
214 |
|
alpar@4
|
215 |
In a recent paper, Ullmann\cite{UllmannBit} presents an
|
alpar@3
|
216 |
improved version of this algorithm based on a bit-vector solution for
|
alpar@3
|
217 |
the binary Constraint Satisfaction Problem.
|
alpar@2
|
218 |
|
Madarasi@19
|
219 |
The \emph{Nauty} algorithm\cite{Nauty} transforms the two graphs to
|
alpar@3
|
220 |
a canonical form before starting to check for the isomorphism. It has
|
alpar@3
|
221 |
been considered as one of the fastest graph isomorphism algorithms,
|
alpar@3
|
222 |
although graph categories were shown in which it takes exponentially
|
alpar@3
|
223 |
many steps. This algorithm handles only the graph isomorphism problem.
|
alpar@2
|
224 |
|
alpar@4
|
225 |
The \emph{LAD} algorithm\cite{Lad} uses a depth-first search
|
alpar@3
|
226 |
strategy and formulates the matching as a Constraint Satisfaction
|
alpar@3
|
227 |
Problem to prune the search tree. The constraints are that the mapping
|
alpar@3
|
228 |
has to be injective and edge-preserving, hence it is possible to
|
alpar@3
|
229 |
handle new matching types as well.
|
alpar@2
|
230 |
|
Madarasi@19
|
231 |
The \emph{RI} algorithm\cite{RI} and its variations are based on a
|
alpar@3
|
232 |
state space representation. After reordering the nodes of the graphs,
|
alpar@3
|
233 |
it uses some fast executable heuristic checks without using any
|
alpar@3
|
234 |
complex pruning rules. It seems to run really efficiently on graphs
|
alpar@3
|
235 |
coming from biology, and won the International Contest on Pattern
|
alpar@3
|
236 |
Search in Biological Databases\cite{Content}.
|
alpar@2
|
237 |
|
alpar@3
|
238 |
The currently most commonly used algorithm is the
|
Madarasi@19
|
239 |
\emph{VF2}\cite{VF2}, the improved version of \emph{VF}\cite{VF}, which was
|
alpar@3
|
240 |
designed for solving pattern matching and computer vision problems,
|
alpar@3
|
241 |
and has been one of the best overall algorithms for more than a
|
alpar@3
|
242 |
decade. Although, it can't be up to new specialized algorithms, it is
|
alpar@3
|
243 |
still widely used due to its simplicity and space efficiency. VF2 uses
|
alpar@3
|
244 |
a state space representation and checks some conditions in each state
|
alpar@3
|
245 |
to prune the search tree.
|
alpar@2
|
246 |
|
Madarasi@19
|
247 |
Meanwhile, another variant called \emph{VF2 Plus}\cite{VF2Plus} has
|
alpar@3
|
248 |
been published. It is considered to be as efficient as the RI
|
alpar@3
|
249 |
algorithm and has a strictly better behavior on large graphs. The
|
alpar@3
|
250 |
main idea of VF2 Plus is to precompute a heuristic node order of the
|
alpar@3
|
251 |
small graph, in which the VF2 works more efficiently.
|
alpar@2
|
252 |
|
Madarasi@19
|
253 |
This paper introduces \emph{VF2++}, a new further improved algorithm
|
Madarasi@19
|
254 |
for the graph and (induced)subgraph isomorphism problem, which uses
|
Madarasi@19
|
255 |
efficient cutting rules and determines a node order in which VF2 runs
|
Madarasi@19
|
256 |
significantly faster on practical inputs.
|
Madarasi@19
|
257 |
|
alpar@25
|
258 |
The rest of the paper is structured as
|
alpar@25
|
259 |
follows. Section~\ref{sec:ProbStat} defines the exact problems to be
|
alpar@25
|
260 |
solved, Section~\ref{sec:VF2Alg} provides a description of VF2. Based
|
alpar@25
|
261 |
on that, Section~\ref{sec:VF2ppAlg} introduces VF2++. Some technical
|
alpar@25
|
262 |
details necessary for an efficient implementation are discussed in
|
alpar@25
|
263 |
Section~\ref{sec:VF2ppImpl}. Finally, Section~\ref{sec:ExpRes}
|
alpar@25
|
264 |
provide a detailed experimental evaluation of VF2++ and its comparison
|
alpar@25
|
265 |
to the state-of-the-art algorithms.
|
Madarasi@19
|
266 |
|
alpar@25
|
267 |
It must also be mentioned that the C++ implementations of the
|
alpar@25
|
268 |
algorithms have been made available for evaluation and use under an
|
alpar@25
|
269 |
open source license as a part of LEMON\cite{LEMON} open source graph
|
alpar@25
|
270 |
library.
|
Madarasi@22
|
271 |
|
Madarasi@22
|
272 |
\section{Problem Statement}\label{sec:ProbStat}
|
Madarasi@19
|
273 |
This section provides a formal description of the problems to be
|
alpar@3
|
274 |
solved.
|
alpar@2
|
275 |
\subsection{Definitions}
|
alpar@2
|
276 |
|
Madarasi@19
|
277 |
Throughout the paper $G_{1}=(V_{1}, E_{1})$ and
|
Madarasi@19
|
278 |
$G_{2}=(V_{2}, E_{2})$ denote two undirected graphs.
|
Madarasi@19
|
279 |
|
Madarasi@19
|
280 |
\begin{definition}
|
Madarasi@19
|
281 |
$\mathcal{L}: (V_{1}\cup V_{2}) \longrightarrow K$ is a \textbf{node
|
Madarasi@19
|
282 |
label function}, where K is an arbitrary set. The elements in K
|
Madarasi@19
|
283 |
are the \textbf{node labels}. Two nodes, u and v are said to be
|
Madarasi@19
|
284 |
\textbf{equivalent} if $\mathcal{L}(u)=\mathcal{L}(v)$.
|
Madarasi@19
|
285 |
\end{definition}
|
Madarasi@19
|
286 |
|
Madarasi@19
|
287 |
For the sake of simplicity, in this paper the graph, subgraph and induced subgraph isomorphisms are defined in a more general way.
|
Madarasi@19
|
288 |
|
alpar@2
|
289 |
\begin{definition}\label{sec:ismorphic}
|
Madarasi@19
|
290 |
$G_{1}$ and $G_{2}$ are \textbf{isomorphic} (by the node label $\mathcal{L}$) if $\exists \mathfrak{m}:
|
Madarasi@19
|
291 |
V_{1} \longrightarrow V_{2}$ bijection, for which the
|
alpar@3
|
292 |
following is true:
|
alpar@2
|
293 |
\begin{center}
|
Madarasi@20
|
294 |
$\forall u\in{V_{1}} : \mathcal{L}(u)=\mathcal{L}(\mathfrak{m}(u))$ and\\
|
Madarasi@19
|
295 |
$\forall u,v\in{V_{1}} : (u,v)\in{E_{1}} \Leftrightarrow (\mathfrak{m}(u),\mathfrak{m}(v))\in{E_{2}}$
|
alpar@2
|
296 |
\end{center}
|
alpar@2
|
297 |
\end{definition}
|
Madarasi@19
|
298 |
|
alpar@2
|
299 |
\begin{definition}
|
Madarasi@19
|
300 |
$G_{1}$ is a \textbf{subgraph} of $G_{2}$ (by the node label $\mathcal{L}$) if $\exists \mathfrak{m}:
|
Madarasi@19
|
301 |
V_{1}\longrightarrow V_{2}$ injection, for which the
|
alpar@3
|
302 |
following is true:
|
alpar@2
|
303 |
\begin{center}
|
Madarasi@20
|
304 |
$\forall u\in{V_{1}} : \mathcal{L}(u)=\mathcal{L}(\mathfrak{m}(u))$ and\\
|
Madarasi@19
|
305 |
$\forall u,v \in{V_{1}} : (u,v)\in{E_{1}} \Rightarrow (\mathfrak{m}(u),\mathfrak{m}(v))\in E_{2}$
|
alpar@2
|
306 |
\end{center}
|
alpar@2
|
307 |
\end{definition}
|
alpar@2
|
308 |
|
alpar@2
|
309 |
\begin{definition}
|
Madarasi@19
|
310 |
$G_{1}$ is an \textbf{induced subgraph} of $G_{2}$ (by the node label $\mathcal{L}$) if $\exists
|
Madarasi@19
|
311 |
\mathfrak{m}: V_{1}\longrightarrow V_{2}$ injection, for which the
|
alpar@3
|
312 |
following is true:
|
alpar@2
|
313 |
\begin{center}
|
Madarasi@19
|
314 |
$\forall u\in{V_{1}} : \mathcal{L}(u)=\mathcal{L}(\mathfrak{m}(u))$ and
|
Madarasi@19
|
315 |
|
Madarasi@19
|
316 |
$\forall u,v \in{V_{1}} : (u,v)\in{E_{1}} \Leftrightarrow
|
Madarasi@19
|
317 |
(\mathfrak{m}(u),\mathfrak{m}(v))\in E_{2}$
|
alpar@2
|
318 |
\end{center}
|
alpar@2
|
319 |
\end{definition}
|
alpar@2
|
320 |
|
alpar@2
|
321 |
|
alpar@2
|
322 |
\subsection{Common problems}\label{sec:CommProb}
|
alpar@2
|
323 |
|
alpar@3
|
324 |
The focus of this paper is on two extensively studied topics, the
|
alpar@3
|
325 |
subgraph isomorphism and its variations. However, the following
|
alpar@3
|
326 |
problems also appear in many applications.
|
alpar@2
|
327 |
|
alpar@3
|
328 |
The \textbf{subgraph matching problem} is the following: is
|
Madarasi@19
|
329 |
$G_{1}$ isomorphic to any subgraph of $G_{2}$ by a given node
|
alpar@3
|
330 |
label?
|
alpar@2
|
331 |
|
alpar@3
|
332 |
The \textbf{induced subgraph matching problem} asks the same about the
|
alpar@3
|
333 |
existence of an induced subgraph.
|
alpar@2
|
334 |
|
alpar@3
|
335 |
The \textbf{graph isomorphism problem} can be defined as induced
|
alpar@3
|
336 |
subgraph matching problem where the sizes of the two graphs are equal.
|
alpar@2
|
337 |
|
Madarasi@22
|
338 |
In addition, one may want to find a \textbf{single} mapping or \textbf{enumerate} all of them.
|
alpar@2
|
339 |
|
Madarasi@22
|
340 |
Note that some authors refer to the term
|
alpar@3
|
341 |
\emph{subgraph isomorphism problem} as an \emph{induced subgraph
|
alpar@3
|
342 |
isomorphism problem}.
|
alpar@2
|
343 |
|
Madarasi@22
|
344 |
\section{The VF2 Algorithm}\label{sec:VF2Alg}
|
alpar@3
|
345 |
This algorithm is the basis of both the VF2++ and the VF2 Plus. VF2
|
alpar@4
|
346 |
is able to handle all the variations mentioned in Section
|
alpar@4
|
347 |
\ref{sec:CommProb}. Although it can also handle directed graphs,
|
alpar@3
|
348 |
for the sake of simplicity, only the undirected case will be
|
alpar@3
|
349 |
discussed.
|
alpar@2
|
350 |
|
alpar@2
|
351 |
|
alpar@2
|
352 |
\subsection{Common notations}
|
Madarasi@19
|
353 |
\indent Assume $G_{1}$ is searched in $G_{2}$. The following
|
alpar@3
|
354 |
definitions and notations will be used throughout the whole paper.
|
alpar@2
|
355 |
\begin{definition}
|
Madarasi@19
|
356 |
An injection $\mathfrak{m} : D \longrightarrow V_2$ is called (partial) \textbf{mapping}, where $D\subseteq V_1$.
|
Madarasi@19
|
357 |
\end{definition}
|
Madarasi@19
|
358 |
|
Madarasi@19
|
359 |
\begin{notation}
|
Madarasi@19
|
360 |
$\mathfrak{D}(f)$ and $\mathfrak{R}(f)$ denote the domain and the range of a function $f$, respectively.
|
Madarasi@19
|
361 |
\end{notation}
|
Madarasi@19
|
362 |
|
Madarasi@19
|
363 |
\begin{definition}
|
Madarasi@19
|
364 |
Mapping $\mathfrak{m}$ \textbf{covers} a node $u\in V_1\cup V_2$ if $u\in \mathfrak{D}(\mathfrak{m})\cup \mathfrak{R}(\mathfrak{m})$.
|
alpar@2
|
365 |
\end{definition}
|
alpar@2
|
366 |
|
alpar@2
|
367 |
\begin{definition}
|
Madarasi@19
|
368 |
A mapping $\mathfrak{m}$ is $\mathbf{whole\ mapping}$ if $\mathfrak{m}$ covers all the
|
Madarasi@19
|
369 |
nodes of $V_{1}$, i.e. $\mathfrak{D}(\mathfrak{m})=V_1$.
|
alpar@2
|
370 |
\end{definition}
|
alpar@2
|
371 |
|
alpar@2
|
372 |
\begin{definition}
|
Madarasi@20
|
373 |
Let \textbf{extend}$(\mathfrak{m},(u,v))$ denote the function $f : \mathfrak{D}(\mathfrak{m})\cup\{u\}\longrightarrow\mathfrak{R}(\mathfrak{m})\cup\{v\}$, for which $\forall w\in \mathfrak{D}(\mathfrak{m}) : \mathfrak{m}(w)=f(w)$ and $f(u)=v$ holds. Where $u\in V_1\setminus\mathfrak{D}(\mathfrak{m})$ and $v\in V_2\setminus\mathfrak{R}(\mathfrak{m})$, otherwise $extend(\mathfrak{m},(u,v))$ is undefined.
|
alpar@2
|
374 |
\end{definition}
|
alpar@2
|
375 |
|
alpar@2
|
376 |
\begin{notation}
|
Madarasi@19
|
377 |
Throughout the paper, $\mathbf{PT}$ denotes a generic problem type
|
Madarasi@19
|
378 |
which can be substituted by any of the $\mathbf{ISO}$, $\mathbf{SUB}$
|
Madarasi@19
|
379 |
and $\mathbf{IND}$ problems.
|
alpar@2
|
380 |
\end{notation}
|
alpar@2
|
381 |
|
alpar@2
|
382 |
\begin{definition}
|
Madarasi@19
|
383 |
Let $\mathfrak{m}$ be a mapping. A logical function $\mathbf{Cons_{PT}}$ is a
|
Madarasi@17
|
384 |
\textbf{consistency function by } $\mathbf{PT}$ if the following
|
Madarasi@19
|
385 |
holds. If there exists a whole mapping $w$ satisfying the requirements of $PT$, for which $\mathfrak{m}$ is exactly $w$ restricted to $\mathfrak{D}(\mathfrak{m})$.
|
alpar@2
|
386 |
\end{definition}
|
alpar@2
|
387 |
|
alpar@2
|
388 |
\begin{definition}
|
Madarasi@19
|
389 |
Let $\mathfrak{m}$ be a mapping. A logical function $\mathbf{Cut_{PT}}$ is a
|
Madarasi@17
|
390 |
\textbf{cutting function by } $\mathbf{PT}$ if the following
|
Madarasi@19
|
391 |
holds. $\mathbf{Cut_{PT}(\mathfrak{m})}$ is false if there exists a sequence of extend operations, which results in a whole mapping satisfying the requirements of $PT$.
|
alpar@2
|
392 |
\end{definition}
|
alpar@2
|
393 |
|
alpar@2
|
394 |
\begin{definition}
|
Madarasi@19
|
395 |
$\mathfrak{m}$ is said to be \textbf{consistent mapping by} $\mathbf{PT}$ if
|
Madarasi@19
|
396 |
$Cons_{PT}(\mathfrak{m})$ is true.
|
alpar@2
|
397 |
\end{definition}
|
alpar@2
|
398 |
|
alpar@2
|
399 |
$Cons_{PT}$ and $Cut_{PT}$ will often be used in the following form.
|
alpar@2
|
400 |
\begin{notation}
|
Madarasi@19
|
401 |
Let $\mathbf{Cons_{PT}(p, \mathfrak{m})}:=Cons_{PT}(extend(\mathfrak{m},p))$, and
|
Madarasi@19
|
402 |
$\mathbf{Cut_{PT}(p, \mathfrak{m})}:=Cut_{PT}(extend(\mathfrak{m},p))$, where
|
Madarasi@19
|
403 |
$p\in{V_{1}\backslash\mathfrak{D}(\mathfrak{m}) \!\times\!V_{2}\backslash\mathfrak{R}(\mathfrak{m})}$.
|
alpar@2
|
404 |
\end{notation}
|
alpar@2
|
405 |
|
alpar@3
|
406 |
$Cons_{PT}$ will be used to check the consistency of the already
|
alpar@3
|
407 |
covered nodes, while $Cut_{PT}$ is for looking ahead to recognize if
|
alpar@3
|
408 |
no whole consistent mapping can contain the current mapping.
|
alpar@2
|
409 |
|
alpar@2
|
410 |
\subsection{Overview of the algorithm}
|
alpar@3
|
411 |
VF2 uses a state space representation of mappings, $Cons_{PT}$ for
|
alpar@3
|
412 |
excluding inconsistency with the problem type and $Cut_{PT}$ for
|
Madarasi@19
|
413 |
pruning the search tree.
|
alpar@2
|
414 |
|
alpar@4
|
415 |
Algorithm~\ref{alg:VF2Pseu} is a high level description of
|
Madarasi@19
|
416 |
the VF2 matching algorithm. Each state of the matching process can
|
Madarasi@19
|
417 |
be associated with a mapping $\mathfrak{m}$. The initial state
|
Madarasi@19
|
418 |
is associated with a mapping $\mathfrak{m}$, for which
|
Madarasi@19
|
419 |
$\mathfrak{D}(\mathfrak{m})=\emptyset$, i.e. it starts with an empty mapping.
|
alpar@2
|
420 |
|
alpar@2
|
421 |
|
alpar@2
|
422 |
\begin{algorithm}
|
Madarasi@13
|
423 |
\algtext*{EndIf}%ne nyomtasson end if-et
|
Madarasi@13
|
424 |
\algtext*{EndFor}%ne
|
Madarasi@13
|
425 |
\algtext*{EndProcedure}%ne nyomtasson ..
|
alpar@2
|
426 |
\caption{\hspace{0.5cm}$A\ high\ level\ description\ of\ VF2$}\label{alg:VF2Pseu}
|
alpar@2
|
427 |
\begin{algorithmic}[1]
|
alpar@2
|
428 |
|
Madarasi@19
|
429 |
\Procedure{VF2}{Mapping $\mathfrak{m}$, ProblemType $PT$}
|
Madarasi@19
|
430 |
\If{$\mathfrak{m}$ covers
|
Madarasi@19
|
431 |
$V_{1}$} \State Output($\mathfrak{m}$)
|
Madarasi@19
|
432 |
\Else
|
Madarasi@19
|
433 |
\State Compute the set $P_\mathfrak{m}$ of the pairs candidate for inclusion
|
Madarasi@19
|
434 |
in $\mathfrak{m}$ \ForAll{$p\in{P_\mathfrak{m}}$} \If{Cons$_{PT}$($p,\mathfrak{m}$) $\wedge$
|
Madarasi@19
|
435 |
$\neg$Cut$_{PT}$($p,\mathfrak{m}$)}
|
Madarasi@19
|
436 |
\State \textbf{call}
|
Madarasi@19
|
437 |
VF2($extend(\mathfrak{m},p)$, $PT$) \EndIf \EndFor \EndIf \EndProcedure
|
alpar@2
|
438 |
\end{algorithmic}
|
alpar@2
|
439 |
\end{algorithm}
|
alpar@2
|
440 |
|
alpar@2
|
441 |
|
Madarasi@19
|
442 |
For the current mapping $\mathfrak{m}$, the algorithm computes $P_\mathfrak{m}$, the set of
|
Madarasi@19
|
443 |
candidate node pairs for adding to the current mapping $\mathfrak{m}_s$.
|
alpar@2
|
444 |
|
Madarasi@19
|
445 |
For each pair $p$ in $P_\mathfrak{m}$, $Cons_{PT}(p,\mathfrak{m})$ and
|
Madarasi@19
|
446 |
$Cut_{PT}(p,\mathfrak{m})$ are evaluated. If the former is true and
|
Madarasi@19
|
447 |
the latter is false, the whole process is recursively applied to
|
Madarasi@19
|
448 |
$extend(\mathfrak{m},p)$. Otherwise, $extend(\mathfrak{m},p)$ is not consistent by $PT$, or it
|
Madarasi@19
|
449 |
can be proved that $\mathfrak{m}$ can not be extended to a whole mapping.
|
alpar@2
|
450 |
|
Madarasi@11
|
451 |
In order to make sure of the correctness, see
|
alpar@2
|
452 |
\begin{claim}
|
alpar@3
|
453 |
Through consistent mappings, only consistent whole mappings can be
|
Madarasi@19
|
454 |
reached, and all the consistent whole mappings are reachable through
|
alpar@3
|
455 |
consistent mappings.
|
alpar@2
|
456 |
\end{claim}
|
alpar@2
|
457 |
|
Madarasi@19
|
458 |
Note that a mapping may be reached in exponentially many different ways, since the
|
Madarasi@19
|
459 |
order of extensions does not influence the nascent mapping.
|
alpar@2
|
460 |
|
alpar@2
|
461 |
However, one may observe
|
alpar@2
|
462 |
|
alpar@2
|
463 |
\begin{claim}
|
alpar@2
|
464 |
\label{claim:claimTotOrd}
|
Madarasi@19
|
465 |
Let $\prec$ be an arbitrary total ordering relation on $V_{1}$. If
|
Madarasi@19
|
466 |
the algorithm ignores each $p=(u,v) \in P_\mathfrak{m}$, for which
|
alpar@2
|
467 |
\begin{center}
|
Madarasi@19
|
468 |
$\exists (\tilde{u},\tilde{v})\in P_\mathfrak{m}: \tilde{u} \prec u$,
|
alpar@2
|
469 |
\end{center}
|
Madarasi@19
|
470 |
then no mapping can be reached more than once, and each whole mapping remains reachable.
|
alpar@2
|
471 |
\end{claim}
|
alpar@2
|
472 |
|
alpar@3
|
473 |
Note that the cornerstone of the improvements to VF2 is a proper
|
alpar@3
|
474 |
choice of a total ordering.
|
alpar@2
|
475 |
|
Madarasi@19
|
476 |
\subsection{The candidate set}
|
alpar@2
|
477 |
\label{candidateComputingVF2}
|
Madarasi@19
|
478 |
Let $P_\mathfrak{m}$ be the set of the candidate pairs for inclusion in $\mathfrak{m}$.
|
alpar@2
|
479 |
|
alpar@2
|
480 |
\begin{notation}
|
Madarasi@19
|
481 |
Let $\mathbf{T_{1}(\mathfrak{m})}:=\{u \in V_{1}\backslash\mathfrak{D}(\mathfrak{m}) : \exists \tilde{u}\in{\mathfrak{D}(\mathfrak{m}): (u,\tilde{u})\in E_{1}}\}$, and
|
Madarasi@19
|
482 |
$\mathbf{T_{2}(\mathfrak{m})} := \{v \in V_{2}\backslash\mathfrak{R}(\mathfrak{m}) : \exists\tilde{v}\in{\mathfrak{R}(\mathfrak{m}):(v,\tilde{v})\in E_{2}}\}$.
|
alpar@2
|
483 |
\end{notation}
|
alpar@2
|
484 |
|
Madarasi@19
|
485 |
The set $P_\mathfrak{m}$ includes the pairs of uncovered neighbours of covered
|
Madarasi@17
|
486 |
nodes, and if there is not such a node pair, all the pairs containing
|
alpar@3
|
487 |
two uncovered nodes are added. Formally, let
|
alpar@2
|
488 |
\[
|
Madarasi@19
|
489 |
P_\mathfrak{m}\!=\!
|
alpar@2
|
490 |
\begin{cases}
|
Madarasi@19
|
491 |
T_{1}(\mathfrak{m})\times T_{2}(\mathfrak{m})&\hspace{-0.15cm}\text{if }
|
Madarasi@19
|
492 |
T_{1}(\mathfrak{m})\!\neq\!\emptyset\ \text{and }T_{2}(\mathfrak{m})\!\neq
|
Madarasi@19
|
493 |
\emptyset,\\ (V_{1}\!\setminus\!\mathfrak{D}(\mathfrak{m}))\!\times\!(V_{2}\!\setminus\!\mathfrak{R}(\mathfrak{m}))
|
Madarasi@19
|
494 |
&\hspace{-0.15cm}\text{otherwise}.
|
alpar@2
|
495 |
\end{cases}
|
alpar@2
|
496 |
\]
|
alpar@2
|
497 |
|
alpar@2
|
498 |
\subsection{Consistency}
|
Madarasi@19
|
499 |
Suppose $p=(u,v)$, where $u\in V_{1}$ and $v\in V_{2}$, $\mathfrak{m}$ is a consistent mapping by
|
Madarasi@19
|
500 |
$PT$. $Cons_{PT}(p,\mathfrak{m})$ checks whether
|
Madarasi@19
|
501 |
including pair $p$ into $\mathfrak{m}$ leads to a consistent mapping by $PT$.
|
Madarasi@15
|
502 |
|
Madarasi@15
|
503 |
For example, the consistency function of induced subgraph isomorphism is as follows.
|
alpar@2
|
504 |
\begin{notation}
|
Madarasi@19
|
505 |
Let $\mathbf{\Gamma_{1} (u)}:=\{\tilde{u}\in V_{1} :
|
Madarasi@19
|
506 |
(u,\tilde{u})\in E_{1}\}$, and $\mathbf{\Gamma_{2}
|
Madarasi@19
|
507 |
(v)}:=\{\tilde{v}\in V_{2} : (v,\tilde{v})\in E_{2}\}$, where $u\in V_{1}$ and $v\in V_{2}$.
|
alpar@2
|
508 |
\end{notation}
|
alpar@2
|
509 |
|
Madarasi@19
|
510 |
$extend(\mathfrak{m},(u,v))$ is a consistent mapping by $IND$ $\Leftrightarrow
|
Madarasi@19
|
511 |
(\forall \tilde{u}\in \mathfrak{D}(\mathfrak{m}): (u,\tilde{u})\in E_{1}
|
Madarasi@19
|
512 |
\Leftrightarrow (v,\mathfrak{m}(\tilde{u}))\in E_{2})$. The
|
alpar@3
|
513 |
following formulation gives an efficient way of calculating
|
alpar@3
|
514 |
$Cons_{IND}$.
|
alpar@2
|
515 |
\begin{claim}
|
Madarasi@19
|
516 |
$Cons_{IND}((u,v),\mathfrak{m}):=\mathcal{L}(u)\!\!=\!\!\mathcal{L}(v)\wedge(\forall \tilde{v}\in \Gamma_{2}(v)\cap\mathfrak{R}(\mathfrak{m}):(u,\mathfrak{m}^{-1}(\tilde{v}))\in E_{1})\wedge
|
Madarasi@19
|
517 |
(\forall \tilde{u}\in \Gamma_{1}(u)
|
Madarasi@19
|
518 |
\cap \mathfrak{D}(\mathfrak{m}):(v,\mathfrak{m}(\tilde{u}))\in E_{2})$ is a
|
alpar@3
|
519 |
consistency function in the case of $IND$.
|
alpar@2
|
520 |
\end{claim}
|
alpar@2
|
521 |
|
alpar@2
|
522 |
\subsection{Cutting rules}
|
Madarasi@19
|
523 |
$Cut_{PT}(p,\mathfrak{m})$ is defined by a collection of efficiently
|
Madarasi@19
|
524 |
verifiable conditions. The requirement is that $Cut_{PT}(p,\mathfrak{m})$ can
|
Madarasi@19
|
525 |
be true only if it is impossible to extend $extend(\mathfrak{m},p)$ to a
|
alpar@3
|
526 |
whole mapping.
|
Madarasi@15
|
527 |
|
Madarasi@15
|
528 |
As an example, the cutting function of induced subgraph isomorphism is presented.
|
alpar@2
|
529 |
\begin{notation}
|
Madarasi@19
|
530 |
Let $\mathbf{\tilde{T}_{1}}(\mathfrak{m}):=(V_{1}\backslash
|
Madarasi@19
|
531 |
\mathfrak{D}(\mathfrak{m}))\backslash T_{1}(\mathfrak{m})$, and
|
Madarasi@19
|
532 |
\\ $\mathbf{\tilde{T}_{2}}(\mathfrak{m}):=(V_{2}\backslash
|
Madarasi@19
|
533 |
\mathfrak{R}(\mathfrak{m}))\backslash T_{2}(\mathfrak{m})$.
|
alpar@2
|
534 |
\end{notation}
|
Madarasi@15
|
535 |
|
alpar@2
|
536 |
\begin{claim}
|
Madarasi@19
|
537 |
$Cut_{IND}((u,v),\mathfrak{m}):= |\Gamma_{2} (v)\ \cap\ T_{2}(\mathfrak{m})| <
|
Madarasi@19
|
538 |
|\Gamma_{1} (u)\ \cap\ T_{1}(\mathfrak{m})| \vee |\Gamma_{2}(v)\cap
|
Madarasi@19
|
539 |
\tilde{T}_{2}(\mathfrak{m})| < |\Gamma_{1}(u)\cap
|
Madarasi@19
|
540 |
\tilde{T}_{1}(\mathfrak{m})|$ is a cutting function by $IND$.
|
alpar@2
|
541 |
\end{claim}
|
alpar@2
|
542 |
|
Madarasi@22
|
543 |
\section{The VF2++ Algorithm}\label{sec:VF2ppAlg}
|
alpar@3
|
544 |
Although any total ordering relation makes the search space of VF2 a
|
alpar@3
|
545 |
tree, its choice turns out to dramatically influence the number of
|
alpar@3
|
546 |
visited states. The goal is to determine an efficient one as quickly
|
alpar@3
|
547 |
as possible.
|
alpar@2
|
548 |
|
alpar@3
|
549 |
The main reason for VF2++' superiority over VF2 is twofold. Firstly,
|
alpar@3
|
550 |
taking into account the structure and the node labeling of the graph,
|
alpar@3
|
551 |
VF2++ determines a state order in which most of the unfruitful
|
alpar@3
|
552 |
branches of the search space can be pruned immediately. Secondly,
|
alpar@3
|
553 |
introducing more efficient --- nevertheless still easier to compute
|
alpar@3
|
554 |
--- cutting rules reduces the chance of going astray even further.
|
alpar@2
|
555 |
|
alpar@3
|
556 |
In addition to the usual subgraph isomorphism, specialized versions
|
alpar@3
|
557 |
for induced subgraph isomorphism and for graph isomorphism have been
|
Madarasi@22
|
558 |
designed.
|
alpar@2
|
559 |
|
Madarasi@22
|
560 |
Note that a weaker version of the cutting rules and an efficient
|
Madarasi@22
|
561 |
candidate set calculating were described in \cite{VF2Plus}.
|
alpar@2
|
562 |
|
alpar@3
|
563 |
It should be noted that all the methods described in this section are
|
Madarasi@22
|
564 |
extendable to handle directed graphs and edge labels as well.
|
alpar@3
|
565 |
The basic ideas and the detailed description of VF2++ are provided in
|
Madarasi@22
|
566 |
the following.\newline
|
alpar@2
|
567 |
|
Madarasi@19
|
568 |
The goal is to find a matching order in which the algorithm is able to
|
Madarasi@19
|
569 |
recognize inconsistency or prune the infeasible branches on the
|
Madarasi@19
|
570 |
highest levels and goes deep only if it is needed.
|
Madarasi@19
|
571 |
|
Madarasi@19
|
572 |
\begin{notation}
|
Madarasi@19
|
573 |
Let $\mathbf{Conn_{H}(u)}:=|\Gamma_{1}(u)\cap H\}|$, that is the
|
Madarasi@19
|
574 |
number of neighbours of u which are in H, where $u\in V_{1} $ and
|
Madarasi@19
|
575 |
$H\subseteq V_{1}$.
|
Madarasi@19
|
576 |
\end{notation}
|
Madarasi@19
|
577 |
|
Madarasi@19
|
578 |
The principal question is the following. Suppose a mapping $\mathfrak{m}$ is
|
Madarasi@19
|
579 |
given. For which node of $T_{1}(\mathfrak{m})$ is the hardest to find a
|
Madarasi@19
|
580 |
consistent pair in $G_{2}$? The more covered neighbours a node in
|
Madarasi@19
|
581 |
$T_{1}(\mathfrak{m})$ has --- i.e. the largest $Conn_{\mathfrak{D}(\mathfrak{m})}$ it has
|
Madarasi@19
|
582 |
---, the more rarely satisfiable consistency constraints for its pair
|
Madarasi@19
|
583 |
are given.
|
Madarasi@19
|
584 |
|
Madarasi@19
|
585 |
In biology, most of the graphs are sparse, thus several nodes in
|
Madarasi@19
|
586 |
$T_{1}(\mathfrak{m})$ may have the same $Conn_{\mathfrak{D}(\mathfrak{m})}$, which makes
|
Madarasi@19
|
587 |
reasonable to define a secondary and a tertiary order between them.
|
Madarasi@19
|
588 |
The observation above proves itself to be as determining, that the
|
Madarasi@19
|
589 |
secondary ordering prefers nodes with the most uncovered neighbours
|
Madarasi@19
|
590 |
among which have the same $Conn_{\mathfrak{D}(\mathfrak{m})}$ to increase
|
Madarasi@19
|
591 |
$Conn_{\mathfrak{D}(\mathfrak{m})}$ of uncovered nodes so much, as possible. The
|
Madarasi@19
|
592 |
tertiary ordering prefers nodes having the rarest uncovered labels.
|
Madarasi@19
|
593 |
|
Madarasi@19
|
594 |
Note that the secondary ordering is the same as the ordering by $deg$,
|
Madarasi@19
|
595 |
which is a static data in front of the above used.
|
Madarasi@19
|
596 |
|
Madarasi@19
|
597 |
These rules can easily result in a matching order which contains the
|
Madarasi@19
|
598 |
nodes of a long path successively, whose nodes may have low $Conn$ and
|
Madarasi@19
|
599 |
is easily matchable into $G_{2}$. To avoid that, a BFS order is
|
Madarasi@19
|
600 |
used, which provides the shortest possible paths.
|
Madarasi@19
|
601 |
\newline
|
Madarasi@19
|
602 |
|
Madarasi@19
|
603 |
In the following, some examples on which the VF2 may be slow are
|
Madarasi@19
|
604 |
described, although they are easily solvable by using a proper
|
Madarasi@19
|
605 |
matching order.
|
Madarasi@19
|
606 |
|
Madarasi@19
|
607 |
\begin{example}
|
Madarasi@19
|
608 |
Suppose $G_{1}$ can be mapped into $G_{2}$ in many ways
|
Madarasi@19
|
609 |
without node labels. Let $u\in V_{1}$ and $v\in V_{2}$.
|
Madarasi@19
|
610 |
\newline
|
Madarasi@19
|
611 |
$\mathcal{L}(u):=black$
|
Madarasi@19
|
612 |
\newline
|
Madarasi@19
|
613 |
$\mathcal{L}(v):=black$
|
Madarasi@19
|
614 |
\newline
|
Madarasi@22
|
615 |
$\mathcal{L}(\tilde{u}):=red \ \forall \tilde{u}\in V_{1}\backslash
|
Madarasi@22
|
616 |
\{u\}$
|
Madarasi@19
|
617 |
\newline
|
Madarasi@22
|
618 |
$\mathcal{L}(\tilde{v}):=red \ \forall \tilde{v}\in V_{2}\backslash
|
Madarasi@22
|
619 |
\{v\}$
|
Madarasi@19
|
620 |
\newline
|
Madarasi@19
|
621 |
|
Madarasi@19
|
622 |
Now, any mapping by $\mathcal{L}$ must contain $(u,v)$, since
|
Madarasi@19
|
623 |
$u$ is black and no node in $V_{2}$ has a black label except
|
Madarasi@19
|
624 |
$v$. If unfortunately $u$ were the last node which will get covered,
|
Madarasi@19
|
625 |
VF2 would check only in the last steps, whether $u$ can be matched to
|
Madarasi@19
|
626 |
$v$.
|
Madarasi@19
|
627 |
\newline
|
Madarasi@19
|
628 |
However, had $u$ been the first matched node, u would have been
|
Madarasi@19
|
629 |
matched immediately to v, so all the mappings would have been
|
Madarasi@19
|
630 |
precluded in which node labels can not correspond.
|
Madarasi@19
|
631 |
\end{example}
|
Madarasi@19
|
632 |
|
Madarasi@19
|
633 |
\begin{example}
|
Madarasi@19
|
634 |
Suppose there is no node label given, $G_{1}$ is a small graph and
|
Madarasi@19
|
635 |
can not be mapped into $G_{2}$ and $u\in V_{1}$.
|
Madarasi@19
|
636 |
\newline
|
Madarasi@19
|
637 |
Let $G'_{1}:=(V_{1}\cup
|
Madarasi@19
|
638 |
\{u'_{1},u'_{2},..,u'_{k}\},E_{1}\cup
|
Madarasi@19
|
639 |
\{(u,u'_{1}),(u'_{1},u'_{2}),..,(u'_{k-1},u'_{k})\})$, that is,
|
Madarasi@19
|
640 |
$G'_{1}$ is $G_{1}\cup \{ a\ k$ long path, which is disjoint
|
Madarasi@19
|
641 |
from $G_{1}$ and one of its starting points is connected to $u\in
|
Madarasi@19
|
642 |
V_{1}\}$.
|
Madarasi@19
|
643 |
\newline
|
Madarasi@19
|
644 |
Is there a subgraph of $G_{2}$, which is isomorph with
|
Madarasi@19
|
645 |
$G'_{1}$?
|
Madarasi@19
|
646 |
\newline
|
Madarasi@19
|
647 |
If unfortunately the nodes of the path were the first $k$ nodes in the
|
Madarasi@19
|
648 |
matching order, the algorithm would iterate through all the possible k
|
Madarasi@19
|
649 |
long paths in $G_{2}$, and it would recognize that no path can be
|
Madarasi@19
|
650 |
extended to $G'_{1}$.
|
Madarasi@19
|
651 |
\newline
|
Madarasi@19
|
652 |
However, had it started by the matching of $G_{1}$, it would not
|
Madarasi@19
|
653 |
have matched any nodes of the path.
|
Madarasi@19
|
654 |
\end{example}
|
Madarasi@19
|
655 |
|
Madarasi@19
|
656 |
These examples may look artificial, but the same problems also appear
|
Madarasi@19
|
657 |
in real-world instances, even though in a less obvious way.
|
Madarasi@19
|
658 |
|
alpar@2
|
659 |
\subsection{Preparations}
|
alpar@2
|
660 |
\begin{claim}
|
alpar@2
|
661 |
\label{claim:claimCoverFromLeft}
|
alpar@3
|
662 |
The total ordering relation uniquely determines a node order, in which
|
Madarasi@19
|
663 |
the nodes of $V_{1}$ will be covered by VF2. From the point of
|
alpar@3
|
664 |
view of the matching procedure, this means, that always the same node
|
Madarasi@19
|
665 |
of $G_{1}$ will be covered on the d-th level.
|
alpar@2
|
666 |
\end{claim}
|
alpar@2
|
667 |
|
alpar@2
|
668 |
\begin{definition}
|
Madarasi@19
|
669 |
An order $(u_{\sigma(1)},u_{\sigma(2)},..,u_{\sigma(|V_{1}|)})$ of
|
Madarasi@19
|
670 |
$V_{1}$ is \textbf{matching order} if exists $\prec$ total
|
alpar@3
|
671 |
ordering relation, s.t. the VF2 with $\prec$ on the d-th level finds
|
Madarasi@19
|
672 |
pair for $u_{\sigma(d)}$ for all $d\in\{1,..,|V_{1}|\}$.
|
alpar@2
|
673 |
\end{definition}
|
alpar@2
|
674 |
|
alpar@2
|
675 |
\begin{claim}\label{claim:MOclaim}
|
Madarasi@17
|
676 |
A total ordering is matching order iff the nodes of every component
|
alpar@3
|
677 |
form an interval in the node sequence, and every node connects to a
|
Madarasi@17
|
678 |
previous node in its component except the first node of each component.
|
alpar@2
|
679 |
\end{claim}
|
alpar@2
|
680 |
|
alpar@3
|
681 |
To summing up, a total ordering always uniquely determines a matching
|
alpar@3
|
682 |
order, and every matching order can be determined by a total ordering,
|
alpar@3
|
683 |
however, more than one different total orderings may determine the
|
alpar@3
|
684 |
same matching order.
|
alpar@2
|
685 |
|
alpar@2
|
686 |
\subsection{Total ordering}
|
Madarasi@19
|
687 |
The matching order will be searched directly.
|
alpar@2
|
688 |
\begin{notation}
|
Madarasi@19
|
689 |
Let \textbf{F$_\mathcal{M}$(l)}$:=|\{v\in V_{2} :
|
Madarasi@19
|
690 |
l=\mathcal{L}(v)\}|-|\{u\in V_{1}\backslash \mathcal{M} : l=\mathcal{L}(u)\}|$ ,
|
Madarasi@19
|
691 |
where $l$ is a label and $\mathcal{M}\subseteq V_{1}$.
|
alpar@2
|
692 |
\end{notation}
|
alpar@2
|
693 |
|
Madarasi@17
|
694 |
\begin{definition}Let $\mathbf{arg\ max}_{f}(S) :=\{u\in S : f(u)=max_{v\in S}\{f(v)\}\}$ and $\mathbf{arg\ min}_{f}(S) := arg\ max_{-f}(S)$, where $S$ is a finite set and $f:S\longrightarrow \mathbb{R}$.
|
alpar@2
|
695 |
\end{definition}
|
alpar@2
|
696 |
|
alpar@2
|
697 |
\begin{algorithm}
|
Madarasi@8
|
698 |
\algtext*{EndIf}
|
Madarasi@8
|
699 |
\algtext*{EndProcedure}
|
alpar@2
|
700 |
\algtext*{EndWhile}
|
Madarasi@13
|
701 |
\algtext*{EndFor}
|
alpar@2
|
702 |
\caption{\hspace{0.5cm}$The\ method\ of\ VF2++\ for\ determining\ the\ node\ order$}\label{alg:VF2PPPseu}
|
alpar@2
|
703 |
\begin{algorithmic}[1]
|
alpar@3
|
704 |
\Procedure{VF2++order}{} \State $\mathcal{M}$ := $\emptyset$
|
Madarasi@19
|
705 |
\Comment{matching order} \While{$V_{1}\backslash \mathcal{M}
|
alpar@3
|
706 |
\neq\emptyset$} \State $r\in$ arg max$_{deg}$ (arg
|
Madarasi@19
|
707 |
min$_{F_\mathcal{M}\circ \mathcal{L}}(V_{1}\backslash
|
alpar@3
|
708 |
\mathcal{M})$)\label{alg:findMin} \State Compute $T$, a BFS tree with
|
alpar@3
|
709 |
root node $r$. \For{$d=0,1,...,depth(T)$} \State $V_d$:=nodes of the
|
alpar@3
|
710 |
$d$-th level \State Process $V_d$ \Comment{See Algorithm
|
Madarasi@8
|
711 |
\ref{alg:VF2PPProcess1}} \EndFor
|
alpar@3
|
712 |
\EndWhile \EndProcedure
|
alpar@2
|
713 |
\end{algorithmic}
|
alpar@2
|
714 |
\end{algorithm}
|
alpar@2
|
715 |
|
alpar@2
|
716 |
\begin{algorithm}
|
Madarasi@8
|
717 |
\algtext*{EndIf}
|
Madarasi@8
|
718 |
\algtext*{EndProcedure}%ne nyomtasson ..
|
alpar@2
|
719 |
\algtext*{EndWhile}
|
Madarasi@8
|
720 |
\caption{\hspace{.5cm}$The\ method\ for\ processing\ a\ level\ of\ the\ BFS\ tree$}\label{alg:VF2PPProcess1}
|
alpar@2
|
721 |
\begin{algorithmic}[1]
|
Madarasi@17
|
722 |
\Procedure{VF2++ProcessLevel}{$V_{d}$} \While{$V_d\neq\emptyset$}
|
Madarasi@22
|
723 |
\State $m\in$ arg min$_{F_{\mathcal{M}\circ\ \mathcal{L}}}($ arg max$_{deg}($arg
|
alpar@3
|
724 |
max$_{Conn_{\mathcal{M}}}(V_{d})))$ \State $V_d:=V_d\backslash m$
|
alpar@3
|
725 |
\State Append node $m$ to the end of $\mathcal{M}$ \State Refresh
|
alpar@3
|
726 |
$F_\mathcal{M}$ \EndWhile \EndProcedure
|
alpar@2
|
727 |
\end{algorithmic}
|
alpar@2
|
728 |
\end{algorithm}
|
alpar@2
|
729 |
|
alpar@4
|
730 |
Algorithm~\ref{alg:VF2PPPseu} is a high level description of the
|
alpar@4
|
731 |
matching order procedure of VF2++. It computes a BFS tree for each
|
Madarasi@19
|
732 |
component in ascending order of their rarest node labels and largest $deg$,
|
alpar@4
|
733 |
whose root vertex is the component's minimal
|
Madarasi@8
|
734 |
node. Algorithm~\ref{alg:VF2PPProcess1} is a method to process a level of the BFS tree, which appends the nodes of the current level in descending
|
Madarasi@8
|
735 |
lexicographic order by $(Conn_{\mathcal{M}},deg,-F_\mathcal{M})$ separately
|
Madarasi@8
|
736 |
to $\mathcal{M}$, and refreshes $F_\mathcal{M}$ immediately.
|
alpar@2
|
737 |
|
alpar@4
|
738 |
Claim~\ref{claim:MOclaim} shows that Algorithm~\ref{alg:VF2PPPseu}
|
alpar@4
|
739 |
provides a matching order.
|
alpar@2
|
740 |
|
alpar@2
|
741 |
|
alpar@2
|
742 |
\subsection{Cutting rules}
|
alpar@2
|
743 |
\label{VF2PPCuttingRules}
|
Madarasi@19
|
744 |
This section presents the cutting rules of VF2++, which are improved by using extra information coming from the node labels.
|
alpar@2
|
745 |
\begin{notation}
|
Madarasi@19
|
746 |
Let $\mathbf{\Gamma_{1}^{l}(u)}:=\{\tilde{u} : \mathcal{L}(\tilde{u})=l
|
Madarasi@19
|
747 |
\wedge \tilde{u}\in \Gamma_{1} (u)\}$ and
|
Madarasi@19
|
748 |
$\mathbf{\Gamma_{2}^{l}(v)}:=\{\tilde{v} : \mathcal{L}(\tilde{v})=l \wedge
|
Madarasi@19
|
749 |
\tilde{v}\in \Gamma_{2} (v)\}$, where $u\in V_{1}$, $v\in
|
Madarasi@19
|
750 |
V_{2}$ and $l$ is a label.
|
alpar@2
|
751 |
\end{notation}
|
alpar@2
|
752 |
|
Madarasi@19
|
753 |
\subsubsection{Induced subgraph isomorphism}
|
alpar@2
|
754 |
\begin{claim}
|
Madarasi@19
|
755 |
\[LabCut_{IND}((u,v),\mathfrak{m}):=\bigvee_{l\ is\ label}|\Gamma_{2}^{l} (v) \cap T_{2}(\mathfrak{m})|\!<\!|\Gamma_{1}^{l}(u)\cap T_{1}(\mathfrak{m})|\ \vee\]\[\bigvee_{l\ is\ label} \newline |\Gamma_{2}^{l}(v)\cap \tilde{T}_{2}(\mathfrak{m})| < |\Gamma_{1}^{l}(u)\cap \tilde{T}_{1}(\mathfrak{m})|\] is a cutting function by IND.
|
Madarasi@19
|
756 |
\end{claim}
|
Madarasi@19
|
757 |
\subsubsection{Graph isomorphism}
|
Madarasi@19
|
758 |
\begin{claim}
|
Madarasi@19
|
759 |
\[LabCut_{ISO}((u,v),\mathfrak{m}):=\bigvee_{l\ is\ label}|\Gamma_{2}^{l} (v) \cap T_{2}(\mathfrak{m})|\!\neq\!|\Gamma_{1}^{l}(u)\cap T_{1}(\mathfrak{m})|\ \vee\]\[\bigvee_{l\ is\ label} \newline |\Gamma_{2}^{l}(v)\cap \tilde{T}_{2}(\mathfrak{m})| \neq |\Gamma_{1}^{l}(u)\cap \tilde{T}_{1}(\mathfrak{m})|\] is a cutting function by ISO.
|
alpar@2
|
760 |
\end{claim}
|
Madarasi@13
|
761 |
|
Madarasi@19
|
762 |
\subsubsection{Subgraph isomorphism}
|
Madarasi@19
|
763 |
\begin{claim}
|
Madarasi@19
|
764 |
\[LabCut_{SU\!B}((u,v),\mathfrak{m}):=\bigvee_{l\ is\ label}|\Gamma_{2}^{l} (v) \cap T_{2}(\mathfrak{m})|\!<\!|\Gamma_{1}^{l}(u)\cap T_{1}(\mathfrak{m})|\] is a cutting function by SUB.
|
Madarasi@19
|
765 |
\end{claim}
|
alpar@2
|
766 |
|
Madarasi@19
|
767 |
|
Madarasi@19
|
768 |
|
Madarasi@22
|
769 |
\section{Implementation details}\label{sec:VF2ppImpl}
|
alpar@3
|
770 |
This section provides a detailed summary of an efficient
|
alpar@3
|
771 |
implementation of VF2++.
|
Madarasi@22
|
772 |
\subsection{Storing a mapping}
|
alpar@3
|
773 |
After fixing an arbitrary node order ($u_0, u_1, ..,
|
Madarasi@19
|
774 |
u_{|G_{1}|-1}$) of $G_{1}$, an array $M$ is usable to store
|
alpar@3
|
775 |
the current mapping in the following way.
|
alpar@2
|
776 |
\[
|
alpar@3
|
777 |
M[i] =
|
alpar@2
|
778 |
\begin{cases}
|
Madarasi@19
|
779 |
v & if\ (u_i,v)\ is\ in\ the\ mapping\\ INV\!ALI\!D &
|
Madarasi@17
|
780 |
if\ no\ node\ has\ been\ mapped\ to\ u_i,
|
alpar@2
|
781 |
\end{cases}
|
alpar@2
|
782 |
\]
|
Madarasi@19
|
783 |
where $i\in\{0,1, ..,|G_{1}|-1\}$, $v\in V_{2}$ and $INV\!ALI\!D$
|
alpar@3
|
784 |
means "no node".
|
Madarasi@22
|
785 |
\subsection{Avoiding the recurrence}
|
alpar@4
|
786 |
The recursion of Algorithm~\ref{alg:VF2Pseu} can be realized
|
Madarasi@9
|
787 |
as a \textit{while loop}, which has a loop counter $depth$ denoting the
|
Madarasi@9
|
788 |
all-time depth of the recursion. Fixing a matching order, let $M$
|
Madarasi@9
|
789 |
denote the array storing the all-time mapping. Based on Claim~\ref{claim:claimCoverFromLeft},
|
Madarasi@19
|
790 |
$M$ is $INV\!ALI\!D$ from index $depth$+1 and not $INV\!ALI\!D$ before
|
Madarasi@9
|
791 |
$depth$. $M[depth]$ changes
|
alpar@3
|
792 |
while the state is being processed, but the property is held before
|
alpar@3
|
793 |
both stepping back to a predecessor state and exploring a successor
|
alpar@3
|
794 |
state.
|
alpar@2
|
795 |
|
alpar@3
|
796 |
The necessary part of the candidate set is easily maintainable or
|
alpar@3
|
797 |
computable by following
|
alpar@4
|
798 |
Section~\ref{candidateComputingVF2}. A much faster method
|
alpar@3
|
799 |
has been designed for biological- and sparse graphs, see the next
|
alpar@3
|
800 |
section for details.
|
alpar@2
|
801 |
|
Madarasi@22
|
802 |
\subsection{Calculating the candidates for a node}
|
alpar@4
|
803 |
Being aware of Claim~\ref{claim:claimCoverFromLeft}, the
|
alpar@3
|
804 |
task is not to maintain the candidate set, but to generate the
|
Madarasi@19
|
805 |
candidate nodes in $G_{2}$ for a given node $u\in V_{1}$. In
|
Madarasi@20
|
806 |
case of any of the three problem types and a mapping $\mathfrak{m}$, if a node $v\in
|
Madarasi@19
|
807 |
V_{2}$ is a potential pair of $u\in V_{1}$, then $\forall
|
Madarasi@20
|
808 |
u'\in \mathfrak{D}(\mathfrak{m}) : (u,u')\in
|
Madarasi@20
|
809 |
E_{1}\Rightarrow (v,\mathfrak{m}(u'))\in
|
Madarasi@19
|
810 |
E_{2}$. That is, each covered neighbour of $u$ has to be mapped to
|
alpar@3
|
811 |
a covered neighbour of $v$.
|
alpar@2
|
812 |
|
alpar@3
|
813 |
Having said that, an algorithm running in $\Theta(deg)$ time is
|
alpar@3
|
814 |
describable if there exists a covered node in the component containing
|
Madarasi@17
|
815 |
$u$, and a linear one otherwise.
|
alpar@2
|
816 |
|
alpar@2
|
817 |
|
Madarasi@22
|
818 |
\subsection{Determining the node order}
|
alpar@3
|
819 |
This section describes how the node order preprocessing method of
|
alpar@3
|
820 |
VF2++ can efficiently be implemented.
|
alpar@2
|
821 |
|
alpar@3
|
822 |
For using lookup tables, the node labels are associated with the
|
alpar@3
|
823 |
numbers $\{0,1,..,|K|-1\}$, where $K$ is the set of the labels. It
|
Madarasi@9
|
824 |
enables $F_\mathcal{M}$ to be stored in an array. At first, the node order
|
alpar@3
|
825 |
$\mathcal{M}=\emptyset$, so $F_\mathcal{M}[i]$ is the number of nodes
|
Madarasi@19
|
826 |
in $V_{1}$ having label $i$, which is easy to compute in
|
Madarasi@19
|
827 |
$\Theta(|V_{1}|)$ steps.
|
alpar@2
|
828 |
|
Madarasi@19
|
829 |
Representing $\mathcal{M}\subseteq V_{1}$ as an array of
|
Madarasi@19
|
830 |
size $|V_{1}|$, both the computation of the BFS tree, and processing its levels by Algorithm~\ref{alg:VF2PPProcess1} can be done inplace by swapping nodes.
|
alpar@2
|
831 |
|
Madarasi@22
|
832 |
\subsection{Cutting rules}
|
alpar@4
|
833 |
In Section~\ref{VF2PPCuttingRules}, the cutting rules were
|
Madarasi@19
|
834 |
described using the sets $T_{1}$, $T_{2}$, $\tilde T_{1}$
|
Madarasi@19
|
835 |
and $\tilde T_{2}$, which are dependent on the all-time mapping
|
alpar@3
|
836 |
(i.e. on the all-time state). The aim is to check the labeled cutting
|
alpar@3
|
837 |
rules of VF2++ in $\Theta(deg)$ time.
|
alpar@2
|
838 |
|
alpar@3
|
839 |
Firstly, suppose that these four sets are given in such a way, that
|
alpar@3
|
840 |
checking whether a node is in a certain set takes constant time,
|
alpar@3
|
841 |
e.g. they are given by their 0-1 characteristic vectors. Let $L$ be an
|
alpar@3
|
842 |
initially zero integer lookup table of size $|K|$. After incrementing
|
Madarasi@19
|
843 |
$L[\mathcal{L}(u')]$ for all $u'\in \Gamma_{1}(u) \cap T_{1}(\mathfrak{m})$ and
|
Madarasi@19
|
844 |
decrementing $L[\mathcal{L}(v')]$ for all $v'\in\Gamma_{2} (v) \cap
|
Madarasi@19
|
845 |
T_{2}(s)$, the first part of the cutting rules is checkable in
|
alpar@3
|
846 |
$\Theta(deg)$ time by considering the proper signs of $L$. Setting $L$
|
alpar@3
|
847 |
to zero takes $\Theta(deg)$ time again, which makes it possible to use
|
Madarasi@9
|
848 |
the same table through the whole algorithm. The second part of the
|
alpar@3
|
849 |
cutting rules can be verified using the same method with $\tilde
|
Madarasi@19
|
850 |
T_{1}$ and $\tilde T_{2}$ instead of $T_{1}$ and
|
Madarasi@19
|
851 |
$T_{2}$. Thus, the overall complexity is $\Theta(deg)$.
|
alpar@2
|
852 |
|
Madarasi@19
|
853 |
Another integer lookup table storing the number of covered neighbours
|
Madarasi@19
|
854 |
of each node in $G_{2}$ gives all the information about the sets
|
Madarasi@19
|
855 |
$T_{2}$ and $\tilde T_{2}$, which is maintainable in
|
alpar@3
|
856 |
$\Theta(deg)$ time when a pair is added or substracted by incrementing
|
alpar@3
|
857 |
or decrementing the proper indices. A further improvement is that the
|
Madarasi@19
|
858 |
values of $L[\mathcal{L}(u')]$ in case of checking $u$ are dependent only on
|
Madarasi@19
|
859 |
$u$, i.e. on the size of the mapping, so for each $u\in V_{1}$ an
|
alpar@3
|
860 |
array of pairs (label, number of such labels) can be stored to skip
|
alpar@3
|
861 |
the maintaining operations. Note that these arrays are at most of size
|
Madarasi@19
|
862 |
$deg$.
|
alpar@2
|
863 |
|
Madarasi@19
|
864 |
Using similar techniques, the consistency function can be evaluated in
|
alpar@3
|
865 |
$\Theta(deg)$ steps, as well.
|
alpar@2
|
866 |
|
Madarasi@22
|
867 |
\section{Experimental results}\label{sec:ExpRes}
|
Madarasi@19
|
868 |
This section compares the performance of VF2++ and VF2 Plus. According to
|
Madarasi@19
|
869 |
our experience, both algorithms run faster than VF2 with orders of
|
Madarasi@19
|
870 |
magnitude, thus its inclusion was not reasonable.
|
alpar@2
|
871 |
|
Madarasi@19
|
872 |
The algorithms were implemented in C++ using the open source
|
Madarasi@19
|
873 |
LEMON graph and network optimization library\cite{LEMON}. The test were carried out on a linux based system with an Intel i7 X980 3.33 GHz CPU and 6 GB of RAM.
|
alpar@2
|
874 |
\subsection{Biological graphs}
|
alpar@3
|
875 |
The tests have been executed on a recent biological dataset created
|
alpar@3
|
876 |
for the International Contest on Pattern Search in Biological
|
Madarasi@7
|
877 |
Databases\cite{Content}, which has been constructed of molecule,
|
Madarasi@7
|
878 |
protein and contact map graphs extracted from the Protein Data
|
alpar@3
|
879 |
Bank\cite{ProteinDataBank}.
|
alpar@2
|
880 |
|
alpar@3
|
881 |
The molecule dataset contains small graphs with less than 100 nodes
|
alpar@3
|
882 |
and an average degree of less than 3. The protein dataset contains
|
alpar@3
|
883 |
graphs having 500-10 000 nodes and an average degree of 4, while the
|
alpar@3
|
884 |
contact map dataset contains graphs with 150-800 nodes and an average
|
alpar@3
|
885 |
degree of 20. \\
|
alpar@2
|
886 |
|
Madarasi@19
|
887 |
In the following, both the induced subgraph isomorphism and the graph
|
alpar@3
|
888 |
isomorphism will be examined.
|
alpar@2
|
889 |
|
Madarasi@17
|
890 |
This dataset provides graph pairs, between which all the induced subgraph isomorphisms have to be found. For runtime results, please see Figure~\ref{fig:bioIND}.
|
Madarasi@7
|
891 |
|
Madarasi@7
|
892 |
In an other experiment, the nodes of each graph in the database had been
|
Madarasi@7
|
893 |
shuffled, and an isomorphism between the shuffled and the original
|
Madarasi@7
|
894 |
graph was searched. The solution times are shown on Figure~\ref{fig:bioISO}.
|
Madarasi@7
|
895 |
|
Madarasi@7
|
896 |
|
Madarasi@17
|
897 |
\begin{figure}[H]
|
Madarasi@17
|
898 |
\vspace*{-2cm}
|
Madarasi@17
|
899 |
\hspace*{-1.5cm}
|
Madarasi@17
|
900 |
\begin{subfigure}[b]{0.55\textwidth}
|
Madarasi@17
|
901 |
\begin{figure}[H]
|
Madarasi@17
|
902 |
\begin{tikzpicture}[trim axis left, trim axis right]
|
Madarasi@17
|
903 |
\begin{axis}[title=Molecules IND,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid
|
Madarasi@17
|
904 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
Madarasi@17
|
905 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
Madarasi@17
|
906 |
format/1000 sep = \thinspace}]
|
Madarasi@17
|
907 |
%\addplot+[only marks] table {proteinsOrig.txt};
|
Madarasi@17
|
908 |
\addplot table {Orig/Molecules.32.txt}; \addplot[mark=triangle*,mark
|
Madarasi@17
|
909 |
size=1.8pt,color=red] table {VF2PPLabel/Molecules.32.txt};
|
Madarasi@17
|
910 |
\end{axis}
|
Madarasi@17
|
911 |
\end{tikzpicture}
|
Madarasi@17
|
912 |
\caption{In the case of molecules, the algorithms have
|
Madarasi@17
|
913 |
similar behaviour, but VF2++ is almost two times faster even on such
|
Madarasi@17
|
914 |
small graphs.} \label{fig:INDMolecule}
|
Madarasi@17
|
915 |
\end{figure}
|
Madarasi@17
|
916 |
\end{subfigure}
|
Madarasi@17
|
917 |
\hspace*{1.5cm}
|
Madarasi@17
|
918 |
\begin{subfigure}[b]{0.55\textwidth}
|
Madarasi@17
|
919 |
\begin{figure}[H]
|
Madarasi@17
|
920 |
\begin{tikzpicture}[trim axis left, trim axis right]
|
Madarasi@17
|
921 |
\begin{axis}[title=Contact maps IND,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid
|
Madarasi@17
|
922 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
Madarasi@17
|
923 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
Madarasi@17
|
924 |
format/1000 sep = \thinspace}]
|
Madarasi@17
|
925 |
%\addplot+[only marks] table {proteinsOrig.txt};
|
Madarasi@17
|
926 |
\addplot table {Orig/ContactMaps.128.txt};
|
Madarasi@17
|
927 |
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
|
Madarasi@17
|
928 |
{VF2PPLabel/ContactMaps.128.txt};
|
Madarasi@17
|
929 |
\end{axis}
|
Madarasi@17
|
930 |
\end{tikzpicture}
|
Madarasi@17
|
931 |
\caption{On contact maps, VF2++ runs almost in constant time, while VF2
|
Madarasi@17
|
932 |
Plus has a near linear behaviour.} \label{fig:INDContact}
|
Madarasi@17
|
933 |
\end{figure}
|
Madarasi@17
|
934 |
\end{subfigure}
|
Madarasi@17
|
935 |
|
Madarasi@17
|
936 |
\begin{center}
|
Madarasi@17
|
937 |
\vspace*{-0.5cm}
|
Madarasi@17
|
938 |
\begin{subfigure}[b]{0.55\textwidth}
|
Madarasi@17
|
939 |
\begin{figure}[H]
|
Madarasi@17
|
940 |
\begin{tikzpicture}[trim axis left, trim axis right]
|
Madarasi@17
|
941 |
\begin{axis}[title=Proteins IND,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid
|
Madarasi@17
|
942 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
Madarasi@17
|
943 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
Madarasi@17
|
944 |
format/1000 sep = \thinspace}] %\addplot+[only marks] table
|
Madarasi@17
|
945 |
{proteinsOrig.txt}; \addplot[mark=*,mark size=1.2pt,color=blue]
|
Madarasi@17
|
946 |
table {Orig/Proteins.256.txt}; \addplot[mark=triangle*,mark
|
Madarasi@17
|
947 |
size=1.8pt,color=red] table {VF2PPLabel/Proteins.256.txt};
|
Madarasi@17
|
948 |
\end{axis}
|
Madarasi@17
|
949 |
\end{tikzpicture}
|
Madarasi@17
|
950 |
\caption{Both the algorithms have linear behaviour on protein
|
Madarasi@17
|
951 |
graphs. VF2++ is more than 10 times faster than VF2
|
Madarasi@17
|
952 |
Plus.} \label{fig:INDProt}
|
Madarasi@17
|
953 |
\end{figure}
|
Madarasi@17
|
954 |
\end{subfigure}
|
Madarasi@17
|
955 |
\end{center}
|
Madarasi@17
|
956 |
\vspace*{-0.5cm}
|
Madarasi@17
|
957 |
\caption{\normalsize{Induced subgraph isomorphism on biological graphs}}\label{fig:bioIND}
|
Madarasi@17
|
958 |
\end{figure}
|
Madarasi@17
|
959 |
|
alpar@2
|
960 |
|
alpar@2
|
961 |
\begin{figure}[H]
|
Madarasi@7
|
962 |
\vspace*{-2cm}
|
Madarasi@7
|
963 |
\hspace*{-1.5cm}
|
Madarasi@7
|
964 |
\begin{subfigure}[b]{0.55\textwidth}
|
Madarasi@7
|
965 |
\begin{figure}[H]
|
Madarasi@7
|
966 |
\begin{tikzpicture}[trim axis left, trim axis right]
|
Madarasi@7
|
967 |
\begin{axis}[title=Molecules ISO,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid
|
Madarasi@7
|
968 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
Madarasi@7
|
969 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
Madarasi@7
|
970 |
format/1000 sep = \thinspace}]
|
Madarasi@7
|
971 |
%\addplot+[only marks] table {proteinsOrig.txt};
|
Madarasi@7
|
972 |
\addplot table {Orig/moleculesIso.txt}; \addplot[mark=triangle*,mark
|
Madarasi@7
|
973 |
size=1.8pt,color=red] table {VF2PPLabel/moleculesIso.txt};
|
Madarasi@7
|
974 |
\end{axis}
|
Madarasi@7
|
975 |
\end{tikzpicture}
|
Madarasi@7
|
976 |
\caption{In the case of molecules, there is not such a significant
|
Madarasi@7
|
977 |
difference, but VF2++ seems to be faster as the number of nodes
|
Madarasi@7
|
978 |
increases.}\label{fig:ISOMolecule}
|
Madarasi@7
|
979 |
\end{figure}
|
Madarasi@7
|
980 |
\end{subfigure}
|
Madarasi@7
|
981 |
\hspace*{1.5cm}
|
Madarasi@7
|
982 |
\begin{subfigure}[b]{0.55\textwidth}
|
Madarasi@7
|
983 |
\begin{figure}[H]
|
Madarasi@7
|
984 |
\begin{tikzpicture}[trim axis left, trim axis right]
|
Madarasi@7
|
985 |
\begin{axis}[title=Contact maps ISO,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid
|
Madarasi@7
|
986 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
Madarasi@7
|
987 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
Madarasi@7
|
988 |
format/1000 sep = \thinspace}]
|
Madarasi@7
|
989 |
%\addplot+[only marks] table {proteinsOrig.txt};
|
Madarasi@7
|
990 |
\addplot table {Orig/contactMapsIso.txt}; \addplot[mark=triangle*,mark
|
Madarasi@7
|
991 |
size=1.8pt,color=red] table {VF2PPLabel/contactMapsIso.txt};
|
Madarasi@7
|
992 |
\end{axis}
|
Madarasi@7
|
993 |
\end{tikzpicture}
|
Madarasi@7
|
994 |
\caption{The results are closer to each other on contact maps, but
|
Madarasi@7
|
995 |
VF2++ still performs consistently better.}\label{fig:ISOContact}
|
Madarasi@7
|
996 |
\end{figure}
|
Madarasi@7
|
997 |
\end{subfigure}
|
Madarasi@7
|
998 |
|
alpar@2
|
999 |
\begin{center}
|
Madarasi@7
|
1000 |
\vspace*{-0.5cm}
|
Madarasi@7
|
1001 |
\begin{subfigure}[b]{0.55\textwidth}
|
Madarasi@7
|
1002 |
\begin{figure}[H]
|
Madarasi@7
|
1003 |
\begin{tikzpicture}[trim axis left, trim axis right]
|
Madarasi@7
|
1004 |
\begin{axis}[title=Proteins ISO,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid
|
Madarasi@7
|
1005 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
Madarasi@7
|
1006 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
Madarasi@7
|
1007 |
format/1000 sep = \thinspace}]
|
Madarasi@7
|
1008 |
%\addplot+[only marks] table {proteinsOrig.txt};
|
Madarasi@7
|
1009 |
\addplot table {Orig/proteinsIso.txt}; \addplot[mark=triangle*,mark
|
Madarasi@7
|
1010 |
size=1.8pt,color=red] table {VF2PPLabel/proteinsIso.txt};
|
Madarasi@7
|
1011 |
\end{axis}
|
Madarasi@7
|
1012 |
\end{tikzpicture}
|
Madarasi@7
|
1013 |
\caption{On protein graphs, VF2 Plus has a super linear time
|
Madarasi@7
|
1014 |
complexity, while VF2++ runs in near constant time. The difference
|
Madarasi@7
|
1015 |
is about two order of magnitude on large graphs.}\label{fig:ISOProt}
|
Madarasi@7
|
1016 |
\end{figure}
|
Madarasi@7
|
1017 |
\end{subfigure}
|
Madarasi@7
|
1018 |
\end{center}
|
Madarasi@7
|
1019 |
\vspace*{-0.6cm}
|
Madarasi@17
|
1020 |
\caption{\normalsize{Graph isomorphism on biological graphs}}\label{fig:bioISO}
|
Madarasi@7
|
1021 |
\end{figure}
|
Madarasi@7
|
1022 |
|
Madarasi@7
|
1023 |
|
alpar@2
|
1024 |
|
alpar@2
|
1025 |
|
alpar@2
|
1026 |
\subsection{Random graphs}
|
alpar@3
|
1027 |
This section compares VF2++ with VF2 Plus on random graphs of a large
|
alpar@3
|
1028 |
size. The node labels are uniformly distributed. Let $\delta$ denote
|
alpar@3
|
1029 |
the average degree. For the parameters of problems solved in the
|
alpar@3
|
1030 |
experiments, please see the top of each chart.
|
alpar@2
|
1031 |
\subsubsection{Graph isomorphism}
|
alpar@3
|
1032 |
To evaluate the efficiency of the algorithms in the case of graph
|
Madarasi@17
|
1033 |
isomorphism, random connected graphs of less than 20 000 nodes have been
|
alpar@3
|
1034 |
considered. Generating a random graph and shuffling its nodes, an
|
Madarasi@7
|
1035 |
isomorphism had to be found. Figure \ref{fig:randISO} shows the runtime results
|
alpar@4
|
1036 |
on graph sets of various density.
|
alpar@2
|
1037 |
|
Madarasi@7
|
1038 |
|
Madarasi@7
|
1039 |
|
Madarasi@7
|
1040 |
|
Madarasi@12
|
1041 |
\begin{figure}
|
Madarasi@7
|
1042 |
\vspace*{-1.5cm}
|
Madarasi@7
|
1043 |
\hspace*{-1.5cm}
|
Madarasi@7
|
1044 |
\begin{subfigure}[b]{0.55\textwidth}
|
alpar@2
|
1045 |
\begin{center}
|
alpar@2
|
1046 |
\begin{tikzpicture}
|
Madarasi@7
|
1047 |
\begin{axis}[title={Random ISO, $\delta = 5$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
|
alpar@3
|
1048 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
alpar@3
|
1049 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
Madarasi@7
|
1050 |
format/1000 sep = \space}]
|
alpar@2
|
1051 |
%\addplot+[only marks] table {proteinsOrig.txt};
|
alpar@2
|
1052 |
\addplot table {randGraph/iso/vf2pIso5_1.txt};
|
alpar@3
|
1053 |
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
|
alpar@3
|
1054 |
{randGraph/iso/vf2ppIso5_1.txt};
|
alpar@2
|
1055 |
\end{axis}
|
alpar@2
|
1056 |
\end{tikzpicture}
|
alpar@2
|
1057 |
\end{center}
|
Madarasi@7
|
1058 |
\end{subfigure}
|
Madarasi@7
|
1059 |
%\hspace{1cm}
|
Madarasi@7
|
1060 |
\begin{subfigure}[b]{0.55\textwidth}
|
alpar@2
|
1061 |
\begin{center}
|
alpar@2
|
1062 |
\begin{tikzpicture}
|
Madarasi@7
|
1063 |
\begin{axis}[title={Random ISO, $\delta = 10$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
|
alpar@3
|
1064 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
alpar@3
|
1065 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
Madarasi@7
|
1066 |
format/1000 sep = \space}]
|
alpar@2
|
1067 |
%\addplot+[only marks] table {proteinsOrig.txt};
|
alpar@2
|
1068 |
\addplot table {randGraph/iso/vf2pIso10_1.txt};
|
alpar@3
|
1069 |
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
|
alpar@3
|
1070 |
{randGraph/iso/vf2ppIso10_1.txt};
|
alpar@2
|
1071 |
\end{axis}
|
alpar@2
|
1072 |
\end{tikzpicture}
|
alpar@2
|
1073 |
\end{center}
|
Madarasi@7
|
1074 |
\end{subfigure}
|
Madarasi@7
|
1075 |
%%\hspace{1cm}
|
Madarasi@7
|
1076 |
\hspace*{-1.5cm}
|
Madarasi@7
|
1077 |
\begin{subfigure}[b]{0.55\textwidth}
|
alpar@2
|
1078 |
\begin{center}
|
alpar@2
|
1079 |
\begin{tikzpicture}
|
Madarasi@7
|
1080 |
\begin{axis}[title={Random ISO, $\delta = 15$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
|
alpar@3
|
1081 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
alpar@3
|
1082 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
Madarasi@7
|
1083 |
format/1000 sep = \space}]
|
alpar@2
|
1084 |
%\addplot+[only marks] table {proteinsOrig.txt};
|
alpar@2
|
1085 |
\addplot table {randGraph/iso/vf2pIso15_1.txt};
|
alpar@3
|
1086 |
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
|
alpar@3
|
1087 |
{randGraph/iso/vf2ppIso15_1.txt};
|
alpar@2
|
1088 |
\end{axis}
|
alpar@2
|
1089 |
\end{tikzpicture}
|
alpar@2
|
1090 |
\end{center}
|
Madarasi@7
|
1091 |
\end{subfigure}
|
Madarasi@7
|
1092 |
\begin{subfigure}[b]{0.55\textwidth}
|
alpar@2
|
1093 |
\begin{center}
|
alpar@2
|
1094 |
\begin{tikzpicture}
|
Madarasi@7
|
1095 |
\begin{axis}[title={Random ISO, $\delta = 100$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
|
alpar@3
|
1096 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
alpar@3
|
1097 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
alpar@3
|
1098 |
format/1000 sep = \thinspace}]
|
alpar@2
|
1099 |
%\addplot+[only marks] table {proteinsOrig.txt};
|
alpar@2
|
1100 |
\addplot table {randGraph/iso/vf2pIso100_1.txt};
|
alpar@3
|
1101 |
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
|
alpar@3
|
1102 |
{randGraph/iso/vf2ppIso100_1.txt};
|
alpar@2
|
1103 |
\end{axis}
|
alpar@2
|
1104 |
\end{tikzpicture}
|
Madarasi@23
|
1105 |
\end{center}
|
Madarasi@7
|
1106 |
\end{subfigure}
|
alpar@2
|
1107 |
\vspace*{-0.8cm}
|
Madarasi@23
|
1108 |
\caption{ISO on random graphs.
|
Madarasi@23
|
1109 |
}\label{fig:randISO}
|
alpar@2
|
1110 |
\end{figure}
|
alpar@2
|
1111 |
|
alpar@2
|
1112 |
|
alpar@2
|
1113 |
\subsubsection{Induced subgraph isomorphism}
|
Madarasi@17
|
1114 |
This section presents a comparison of VF2++ and VF2 Plus in the case
|
alpar@3
|
1115 |
of induced subgraph isomorphism. In addition to the size of the large
|
alpar@3
|
1116 |
graph, that of the small graph dramatically influences the hardness of
|
alpar@3
|
1117 |
a given problem too, so the overall picture is provided by examining
|
alpar@3
|
1118 |
small graphs of various size.
|
alpar@2
|
1119 |
|
Madarasi@17
|
1120 |
For each chart, a number $0<\rho< 1$ has been fixed, and the following
|
Madarasi@19
|
1121 |
has been executed 150 times. Generating a large graph $G_{2}$ of an average degree of $\delta$,
|
Madarasi@19
|
1122 |
choose 10 of its induced subgraphs having $\rho\ |V_{2}|$ nodes,
|
alpar@3
|
1123 |
and for all the 10 subgraphs find a mapping by using both the graph
|
alpar@3
|
1124 |
matching algorithms. The $\delta = 5, 10, 35$ and $\rho = 0.05, 0.1,
|
Madarasi@23
|
1125 |
0.3, 0.8$ cases have been examined, see
|
alpar@4
|
1126 |
Figure~\ref{fig:randIND5}, \ref{fig:randIND10} and
|
Madarasi@10
|
1127 |
\ref{fig:randIND35}.
|
alpar@2
|
1128 |
|
alpar@2
|
1129 |
|
alpar@2
|
1130 |
|
alpar@2
|
1131 |
|
alpar@2
|
1132 |
|
Madarasi@12
|
1133 |
\begin{figure}
|
Madarasi@7
|
1134 |
\vspace*{-1.5cm}
|
Madarasi@7
|
1135 |
\hspace*{-1.5cm}
|
alpar@2
|
1136 |
\begin{subfigure}[b]{0.55\textwidth}
|
alpar@2
|
1137 |
\begin{center}
|
alpar@2
|
1138 |
\begin{tikzpicture}
|
alpar@2
|
1139 |
\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.05$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
|
alpar@3
|
1140 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
alpar@3
|
1141 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
alpar@3
|
1142 |
format/1000 sep = \space}]
|
alpar@2
|
1143 |
%\addplot+[only marks] table {proteinsOrig.txt};
|
alpar@2
|
1144 |
\addplot table {randGraph/ind/vf2pInd5_0.05.txt};
|
alpar@3
|
1145 |
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
|
alpar@3
|
1146 |
{randGraph/ind/vf2ppInd5_0.05.txt};
|
alpar@2
|
1147 |
\end{axis}
|
alpar@2
|
1148 |
\end{tikzpicture}
|
alpar@2
|
1149 |
\end{center}
|
alpar@2
|
1150 |
\end{subfigure}
|
alpar@2
|
1151 |
\begin{subfigure}[b]{0.55\textwidth}
|
alpar@2
|
1152 |
\begin{center}
|
alpar@2
|
1153 |
\begin{tikzpicture}
|
alpar@2
|
1154 |
\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.1$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
|
alpar@3
|
1155 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
alpar@3
|
1156 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
alpar@3
|
1157 |
format/1000 sep = \space}]
|
alpar@2
|
1158 |
%\addplot+[only marks] table {proteinsOrig.txt};
|
alpar@2
|
1159 |
\addplot table {randGraph/ind/vf2pInd5_0.1.txt};
|
alpar@3
|
1160 |
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
|
alpar@3
|
1161 |
{randGraph/ind/vf2ppInd5_0.1.txt};
|
alpar@2
|
1162 |
\end{axis}
|
alpar@2
|
1163 |
\end{tikzpicture}
|
alpar@2
|
1164 |
\end{center}
|
alpar@2
|
1165 |
\end{subfigure}
|
Madarasi@7
|
1166 |
\hspace*{-1.5cm}
|
alpar@2
|
1167 |
\begin{subfigure}[b]{0.55\textwidth}
|
alpar@2
|
1168 |
\begin{center}
|
alpar@2
|
1169 |
\begin{tikzpicture}
|
alpar@2
|
1170 |
\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.3$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
|
alpar@3
|
1171 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
alpar@3
|
1172 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
alpar@3
|
1173 |
format/1000 sep = \space}]
|
alpar@2
|
1174 |
%\addplot+[only marks] table {proteinsOrig.txt};
|
alpar@2
|
1175 |
\addplot table {randGraph/ind/vf2pInd5_0.3.txt};
|
alpar@3
|
1176 |
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
|
alpar@3
|
1177 |
{randGraph/ind/vf2ppInd5_0.3.txt};
|
alpar@2
|
1178 |
\end{axis}
|
alpar@2
|
1179 |
\end{tikzpicture}
|
alpar@2
|
1180 |
\end{center}
|
alpar@2
|
1181 |
\end{subfigure}
|
alpar@2
|
1182 |
\begin{subfigure}[b]{0.55\textwidth}
|
alpar@2
|
1183 |
\begin{center}
|
alpar@2
|
1184 |
\begin{tikzpicture}
|
alpar@2
|
1185 |
\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.8$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
|
alpar@3
|
1186 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
alpar@3
|
1187 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
alpar@3
|
1188 |
format/1000 sep = \space}]
|
alpar@2
|
1189 |
%\addplot+[only marks] table {proteinsOrig.txt};
|
alpar@2
|
1190 |
\addplot table {randGraph/ind/vf2pInd5_0.8.txt};
|
alpar@3
|
1191 |
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
|
alpar@3
|
1192 |
{randGraph/ind/vf2ppInd5_0.8.txt};
|
alpar@2
|
1193 |
\end{axis}
|
alpar@2
|
1194 |
\end{tikzpicture}
|
Madarasi@23
|
1195 |
\end{center}
|
alpar@2
|
1196 |
\end{subfigure}
|
alpar@2
|
1197 |
\vspace*{-0.8cm}
|
alpar@3
|
1198 |
\caption{IND on graphs having an average degree of
|
alpar@3
|
1199 |
5.}\label{fig:randIND5}
|
alpar@2
|
1200 |
\end{figure}
|
alpar@2
|
1201 |
|
alpar@2
|
1202 |
|
Madarasi@23
|
1203 |
\begin{figure}
|
Madarasi@7
|
1204 |
\vspace*{-1.5cm}
|
Madarasi@7
|
1205 |
\hspace*{-1.5cm}
|
alpar@2
|
1206 |
\begin{subfigure}[b]{0.55\textwidth}
|
alpar@2
|
1207 |
\begin{center}
|
Madarasi@7
|
1208 |
\hspace*{-0.5cm}
|
alpar@2
|
1209 |
\begin{tikzpicture}
|
alpar@2
|
1210 |
\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.05$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
|
alpar@3
|
1211 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
alpar@3
|
1212 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
alpar@3
|
1213 |
format/1000 sep = \space}]
|
alpar@2
|
1214 |
%\addplot+[only marks] table {proteinsOrig.txt};
|
alpar@2
|
1215 |
\addplot table {randGraph/ind/vf2pInd10_0.05.txt};
|
alpar@3
|
1216 |
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
|
alpar@3
|
1217 |
{randGraph/ind/vf2ppInd10_0.05.txt};
|
alpar@2
|
1218 |
\end{axis}
|
alpar@2
|
1219 |
\end{tikzpicture}
|
alpar@2
|
1220 |
\end{center}
|
alpar@2
|
1221 |
\end{subfigure}
|
alpar@2
|
1222 |
\begin{subfigure}[b]{0.55\textwidth}
|
alpar@2
|
1223 |
\begin{center}
|
Madarasi@7
|
1224 |
\hspace*{-0.5cm}
|
alpar@2
|
1225 |
\begin{tikzpicture}
|
alpar@2
|
1226 |
\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.1$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
|
alpar@3
|
1227 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
alpar@3
|
1228 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
alpar@3
|
1229 |
format/1000 sep = \space}]
|
alpar@2
|
1230 |
%\addplot+[only marks] table {proteinsOrig.txt};
|
alpar@2
|
1231 |
\addplot table {randGraph/ind/vf2pInd10_0.1.txt};
|
alpar@3
|
1232 |
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
|
alpar@3
|
1233 |
{randGraph/ind/vf2ppInd10_0.1.txt};
|
alpar@2
|
1234 |
\end{axis}
|
alpar@2
|
1235 |
\end{tikzpicture}
|
alpar@2
|
1236 |
\end{center}
|
alpar@2
|
1237 |
\end{subfigure}
|
Madarasi@7
|
1238 |
\hspace*{-1.5cm}
|
alpar@2
|
1239 |
\begin{subfigure}[b]{0.55\textwidth}
|
alpar@2
|
1240 |
\begin{center}
|
alpar@2
|
1241 |
\begin{tikzpicture}
|
alpar@2
|
1242 |
\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.3$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
|
alpar@3
|
1243 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
alpar@3
|
1244 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
alpar@3
|
1245 |
format/1000 sep = \space}]
|
alpar@2
|
1246 |
%\addplot+[only marks] table {proteinsOrig.txt};
|
alpar@2
|
1247 |
\addplot table {randGraph/ind/vf2pInd10_0.3.txt};
|
alpar@3
|
1248 |
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
|
alpar@3
|
1249 |
{randGraph/ind/vf2ppInd10_0.3.txt};
|
alpar@2
|
1250 |
\end{axis}
|
alpar@2
|
1251 |
\end{tikzpicture}
|
alpar@2
|
1252 |
\end{center}
|
alpar@2
|
1253 |
\end{subfigure}
|
alpar@2
|
1254 |
\begin{subfigure}[b]{0.55\textwidth}
|
alpar@2
|
1255 |
\begin{center}
|
alpar@2
|
1256 |
\begin{tikzpicture}
|
alpar@2
|
1257 |
\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.8$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
|
alpar@3
|
1258 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
alpar@3
|
1259 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
alpar@3
|
1260 |
format/1000 sep = \space}]
|
alpar@2
|
1261 |
%\addplot+[only marks] table {proteinsOrig.txt};
|
alpar@2
|
1262 |
\addplot table {randGraph/ind/vf2pInd10_0.8.txt};
|
alpar@3
|
1263 |
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
|
alpar@3
|
1264 |
{randGraph/ind/vf2ppInd10_0.8.txt};
|
alpar@2
|
1265 |
\end{axis}
|
alpar@2
|
1266 |
\end{tikzpicture}
|
Madarasi@23
|
1267 |
\end{center}
|
alpar@2
|
1268 |
\end{subfigure}
|
alpar@2
|
1269 |
\vspace*{-0.8cm}
|
alpar@3
|
1270 |
\caption{IND on graphs having an average degree of
|
alpar@3
|
1271 |
10.}\label{fig:randIND10}
|
alpar@2
|
1272 |
\end{figure}
|
alpar@2
|
1273 |
|
alpar@2
|
1274 |
|
alpar@2
|
1275 |
|
Madarasi@23
|
1276 |
\begin{figure}
|
Madarasi@7
|
1277 |
\vspace*{-1.5cm}
|
Madarasi@7
|
1278 |
\hspace*{-1.5cm}
|
alpar@2
|
1279 |
\begin{subfigure}[b]{0.55\textwidth}
|
alpar@2
|
1280 |
\begin{center}
|
alpar@2
|
1281 |
\begin{tikzpicture}
|
alpar@2
|
1282 |
\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.05$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
|
alpar@3
|
1283 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
alpar@3
|
1284 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
alpar@3
|
1285 |
format/1000 sep = \space}]
|
alpar@2
|
1286 |
%\addplot+[only marks] table {proteinsOrig.txt};
|
alpar@2
|
1287 |
\addplot table {randGraph/ind/vf2pInd35_0.05.txt};
|
alpar@3
|
1288 |
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
|
alpar@3
|
1289 |
{randGraph/ind/vf2ppInd35_0.05.txt};
|
alpar@2
|
1290 |
\end{axis}
|
alpar@2
|
1291 |
\end{tikzpicture}
|
alpar@2
|
1292 |
\end{center}
|
alpar@2
|
1293 |
\end{subfigure}
|
alpar@2
|
1294 |
\begin{subfigure}[b]{0.55\textwidth}
|
alpar@2
|
1295 |
\begin{center}
|
alpar@2
|
1296 |
\begin{tikzpicture}
|
alpar@2
|
1297 |
\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.1$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
|
alpar@3
|
1298 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
alpar@3
|
1299 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
alpar@3
|
1300 |
format/1000 sep = \space}]
|
alpar@2
|
1301 |
%\addplot+[only marks] table {proteinsOrig.txt};
|
alpar@2
|
1302 |
\addplot table {randGraph/ind/vf2pInd35_0.1.txt};
|
alpar@3
|
1303 |
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
|
alpar@3
|
1304 |
{randGraph/ind/vf2ppInd35_0.1.txt};
|
alpar@2
|
1305 |
\end{axis}
|
alpar@2
|
1306 |
\end{tikzpicture}
|
alpar@2
|
1307 |
\end{center}
|
alpar@2
|
1308 |
\end{subfigure}
|
Madarasi@7
|
1309 |
\hspace*{-1.5cm}
|
alpar@2
|
1310 |
\begin{subfigure}[b]{0.55\textwidth}
|
alpar@2
|
1311 |
\begin{center}
|
alpar@2
|
1312 |
\begin{tikzpicture}
|
alpar@2
|
1313 |
\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.3$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
|
alpar@3
|
1314 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
alpar@3
|
1315 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
alpar@3
|
1316 |
format/1000 sep = \space}]
|
alpar@2
|
1317 |
%\addplot+[only marks] table {proteinsOrig.txt};
|
alpar@2
|
1318 |
\addplot table {randGraph/ind/vf2pInd35_0.3.txt};
|
alpar@3
|
1319 |
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
|
alpar@3
|
1320 |
{randGraph/ind/vf2ppInd35_0.3.txt};
|
alpar@2
|
1321 |
\end{axis}
|
alpar@2
|
1322 |
\end{tikzpicture}
|
alpar@2
|
1323 |
\end{center}
|
alpar@2
|
1324 |
\end{subfigure}
|
alpar@2
|
1325 |
\begin{subfigure}[b]{0.55\textwidth}
|
alpar@2
|
1326 |
\begin{center}
|
alpar@2
|
1327 |
\begin{tikzpicture}
|
alpar@2
|
1328 |
\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.8$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
|
alpar@3
|
1329 |
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
|
alpar@3
|
1330 |
west},scaled x ticks = false,x tick label style={/pgf/number
|
alpar@3
|
1331 |
format/1000 sep = \space}]
|
alpar@2
|
1332 |
%\addplot+[only marks] table {proteinsOrig.txt};
|
alpar@2
|
1333 |
\addplot table {randGraph/ind/vf2pInd35_0.8.txt};
|
alpar@3
|
1334 |
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
|
alpar@3
|
1335 |
{randGraph/ind/vf2ppInd35_0.8.txt};
|
alpar@2
|
1336 |
\end{axis}
|
alpar@2
|
1337 |
\end{tikzpicture}
|
Madarasi@23
|
1338 |
\end{center}
|
alpar@2
|
1339 |
\end{subfigure}
|
alpar@2
|
1340 |
\vspace*{-0.8cm}
|
alpar@3
|
1341 |
\caption{IND on graphs having an average degree of
|
alpar@3
|
1342 |
35.}\label{fig:randIND35}
|
alpar@2
|
1343 |
\end{figure}
|
alpar@2
|
1344 |
|
alpar@2
|
1345 |
|
alpar@3
|
1346 |
Based on these experiments, VF2++ is faster than VF2 Plus and able to
|
alpar@3
|
1347 |
handle really large graphs in milliseconds. Note that when $IND$ was
|
alpar@3
|
1348 |
considered and the small graphs had proportionally few nodes ($\rho =
|
alpar@3
|
1349 |
0.05$, or $\rho = 0.1$), then VF2 Plus produced some inefficient node
|
alpar@4
|
1350 |
orders (e.g. see the $\delta=10$ case on
|
Madarasi@17
|
1351 |
Figure~\ref{fig:randIND10}). If these instances had been excluded, the
|
alpar@3
|
1352 |
charts would have seemed to be similar to the other ones.
|
alpar@3
|
1353 |
Unsurprisingly, as denser graphs are considered, both VF2++ and VF2
|
alpar@3
|
1354 |
Plus slow slightly down, but remain practically usable even on graphs
|
alpar@3
|
1355 |
having 10 000 nodes.
|
alpar@2
|
1356 |
|
alpar@2
|
1357 |
|
alpar@2
|
1358 |
|
alpar@2
|
1359 |
|
alpar@3
|
1360 |
|
alpar@2
|
1361 |
\section{Conclusion}
|
Madarasi@19
|
1362 |
This paper presented VF2++, a new graph matching algorithm based on VF2, called VF2++, and analyzed it from a practical viewpoint.
|
alpar@2
|
1363 |
|
alpar@3
|
1364 |
Recognizing the importance of the node order and determining an
|
alpar@3
|
1365 |
efficient one, VF2++ is able to match graphs of thousands of nodes in
|
alpar@3
|
1366 |
near practically linear time including preprocessing. In addition to
|
alpar@3
|
1367 |
the proper order, VF2++ uses more efficient consistency and cutting
|
alpar@3
|
1368 |
rules which are easy to compute and make the algorithm able to prune
|
alpar@3
|
1369 |
most of the unfruitful branches without going astray.
|
alpar@2
|
1370 |
|
alpar@3
|
1371 |
In order to show the efficiency of the new method, it has been
|
Madarasi@19
|
1372 |
compared to VF2 Plus\cite{VF2Plus}, which is the best contemporary algorithm.
|
Madarasi@19
|
1373 |
.
|
alpar@2
|
1374 |
|
alpar@3
|
1375 |
The experiments show that VF2++ consistently outperforms VF2 Plus on
|
alpar@3
|
1376 |
biological graphs. It seems to be asymptotically faster on protein and
|
alpar@3
|
1377 |
on contact map graphs in the case of induced subgraph isomorphism,
|
alpar@3
|
1378 |
while in the case of graph isomorphism, it has definitely better
|
alpar@3
|
1379 |
asymptotic behaviour on protein graphs.
|
alpar@2
|
1380 |
|
alpar@3
|
1381 |
Regarding random sparse graphs, not only has VF2++ proved itself to be
|
Madarasi@19
|
1382 |
faster than VF2 Plus, but it also has a practically linear behaviour both
|
Madarasi@19
|
1383 |
in the case of induced subgraph- and graph isomorphism.
|
alpar@2
|
1384 |
|
alpar@25
|
1385 |
%%%%%%%%%%%%%%%%
|
alpar@25
|
1386 |
\section*{Acknowledgement} \label{sec:ack}
|
alpar@25
|
1387 |
%%%%%%%%%%%%%%%%
|
alpar@25
|
1388 |
This research project was initiated and sponsored by QuantumBio
|
alpar@25
|
1389 |
Inc.\cite{QUANTUMBIO}.
|
alpar@25
|
1390 |
|
alpar@25
|
1391 |
The authors were supported by the Hungarian Scientific Research Fund -
|
alpar@25
|
1392 |
OTKA, K109240 and by the J\'anos Bolyai Research Fellowship program of
|
alpar@25
|
1393 |
the Hungarian Academy of Sciences.
|
alpar@2
|
1394 |
|
alpar@0
|
1395 |
|
alpar@0
|
1396 |
%% The Appendices part is started with the command \appendix;
|
alpar@0
|
1397 |
%% appendix sections are then done as normal sections
|
alpar@0
|
1398 |
%% \appendix
|
alpar@0
|
1399 |
|
alpar@0
|
1400 |
%% \section{}
|
alpar@0
|
1401 |
%% \label{}
|
alpar@0
|
1402 |
|
alpar@0
|
1403 |
%% If you have bibdatabase file and want bibtex to generate the
|
alpar@0
|
1404 |
%% bibitems, please use
|
alpar@0
|
1405 |
%%
|
alpar@3
|
1406 |
\bibliographystyle{elsarticle-num} \bibliography{bibliography}
|
alpar@0
|
1407 |
|
alpar@0
|
1408 |
%% else use the following coding to input the bibitems directly in the
|
alpar@0
|
1409 |
%% TeX file.
|
alpar@0
|
1410 |
|
alpar@2
|
1411 |
%% \begin{thebibliography}{00}
|
alpar@0
|
1412 |
|
alpar@2
|
1413 |
%% %% \bibitem{label}
|
alpar@2
|
1414 |
%% %% Text of bibliographic item
|
alpar@0
|
1415 |
|
alpar@2
|
1416 |
%% \bibitem{}
|
alpar@0
|
1417 |
|
alpar@2
|
1418 |
%% \end{thebibliography}
|
alpar@2
|
1419 |
|
alpar@0
|
1420 |
\end{document}
|
alpar@0
|
1421 |
\endinput
|
alpar@0
|
1422 |
%%
|
alpar@0
|
1423 |
%% End of file `elsarticle-template-num.tex'.
|