damecco.tex
author Madarasi Peter
Wed, 23 Nov 2016 02:07:56 +0100
changeset 7 4989d677d5e3
parent 5 c04b029e623a
child 8 49574b75404b
permissions -rw-r--r--
Bio plots grouped
alpar@0
     1
%% 
alpar@0
     2
%% Copyright 2007, 2008, 2009 Elsevier Ltd
alpar@0
     3
%% 
alpar@0
     4
%% This file is part of the 'Elsarticle Bundle'.
alpar@0
     5
%% ---------------------------------------------
alpar@0
     6
%% 
alpar@0
     7
%% It may be distributed under the conditions of the LaTeX Project Public
alpar@0
     8
%% License, either version 1.2 of this license or (at your option) any
alpar@0
     9
%% later version.  The latest version of this license is in
alpar@0
    10
%%    http://www.latex-project.org/lppl.txt
alpar@0
    11
%% and version 1.2 or later is part of all distributions of LaTeX
alpar@0
    12
%% version 1999/12/01 or later.
alpar@0
    13
%% 
alpar@0
    14
%% The list of all files belonging to the 'Elsarticle Bundle' is
alpar@0
    15
%% given in the file `manifest.txt'.
alpar@0
    16
%% 
alpar@0
    17
alpar@0
    18
%% Template article for Elsevier's document class `elsarticle'
alpar@0
    19
%% with numbered style bibliographic references
alpar@0
    20
%% SP 2008/03/01
alpar@0
    21
alpar@0
    22
\documentclass[preprint,12pt]{elsarticle}
alpar@0
    23
alpar@0
    24
%% Use the option review to obtain double line spacing
alpar@0
    25
%% \documentclass[authoryear,preprint,review,12pt]{elsarticle}
alpar@0
    26
alpar@0
    27
%% Use the options 1p,twocolumn; 3p; 3p,twocolumn; 5p; or 5p,twocolumn
alpar@0
    28
%% for a journal layout:
alpar@0
    29
%% \documentclass[final,1p,times]{elsarticle}
alpar@0
    30
%% \documentclass[final,1p,times,twocolumn]{elsarticle}
alpar@0
    31
%% \documentclass[final,3p,times]{elsarticle}
alpar@0
    32
%% \documentclass[final,3p,times,twocolumn]{elsarticle}
alpar@0
    33
%% \documentclass[final,5p,times]{elsarticle}
alpar@0
    34
%% \documentclass[final,5p,times,twocolumn]{elsarticle}
alpar@0
    35
alpar@0
    36
%% For including figures, graphicx.sty has been loaded in
alpar@0
    37
%% elsarticle.cls. If you prefer to use the old commands
alpar@0
    38
%% please give \usepackage{epsfig}
alpar@0
    39
alpar@0
    40
%% The amssymb package provides various useful mathematical symbols
alpar@0
    41
\usepackage{amssymb}
alpar@0
    42
%% The amsthm package provides extended theorem environments
alpar@0
    43
%% \usepackage{amsthm}
alpar@0
    44
alpar@0
    45
%% The lineno packages adds line numbers. Start line numbering with
alpar@0
    46
%% \begin{linenumbers}, end it with \end{linenumbers}. Or switch it on
alpar@0
    47
%% for the whole article with \linenumbers.
alpar@0
    48
%% \usepackage{lineno}
alpar@0
    49
alpar@2
    50
\usepackage{amsmath}
alpar@2
    51
%% \usepackage[pdftex]{graphicx}
alpar@2
    52
alpar@2
    53
\usepackage{pgfplots}
alpar@2
    54
\pgfplotsset{width=9cm}
alpar@2
    55
\pgfplotsset{compat=1.8}
alpar@2
    56
alpar@2
    57
\usepackage{caption}
alpar@2
    58
\usepackage{subcaption} 
alpar@2
    59
alpar@2
    60
\usepackage{algorithm}
alpar@2
    61
\usepackage{algpseudocode}
alpar@2
    62
\usepackage{tikz}
alpar@2
    63
alpar@2
    64
\usepackage{amsthm,amssymb}
alpar@2
    65
\renewcommand{\qedsymbol}{\rule{0.7em}{0.7em}}
alpar@2
    66
alpar@2
    67
\newtheorem{theorem}{Theorem}[subsection]
alpar@2
    68
\newtheorem{corollary}{Corollary}[theorem]
alpar@2
    69
\newtheorem{claim}[theorem]{Claim}
alpar@2
    70
alpar@2
    71
\newtheorem{definition}{Definition}[subsection]
alpar@2
    72
\newtheorem{notation}{Notation}[subsection]
alpar@2
    73
\newtheorem{example}{Example}[subsection]
alpar@2
    74
\usetikzlibrary{decorations.markings}
alpar@2
    75
\let\oldproofname=\proofname
alpar@2
    76
%% \renewcommand{\proofname}{\rm\bf{Proof:}}
alpar@2
    77
Madarasi@7
    78
\captionsetup{font=normalsize}
Madarasi@7
    79
alpar@1
    80
\journal{Discrete Applied Mathematics}
alpar@0
    81
alpar@0
    82
\begin{document}
alpar@0
    83
alpar@0
    84
\begin{frontmatter}
alpar@0
    85
alpar@0
    86
%% Title, authors and addresses
alpar@0
    87
alpar@0
    88
%% use the tnoteref command within \title for footnotes;
alpar@0
    89
%% use the tnotetext command for theassociated footnote;
alpar@0
    90
%% use the fnref command within \author or \address for footnotes;
alpar@0
    91
%% use the fntext command for theassociated footnote;
alpar@0
    92
%% use the corref command within \author for corresponding author footnotes;
alpar@0
    93
%% use the cortext command for theassociated footnote;
alpar@0
    94
%% use the ead command for the email address,
alpar@0
    95
%% and the form \ead[url] for the home page:
alpar@0
    96
%% \title{Title\tnoteref{label1}}
alpar@0
    97
%% \tnotetext[label1]{}
alpar@0
    98
%% \author{Name\corref{cor1}\fnref{label2}}
alpar@0
    99
%% \ead{email address}
alpar@0
   100
%% \ead[url]{home page}
alpar@0
   101
%% \fntext[label2]{}
alpar@0
   102
%% \cortext[cor1]{}
alpar@0
   103
%% \address{Address\fnref{label3}}
alpar@0
   104
%% \fntext[label3]{}
alpar@0
   105
alpar@1
   106
\title{Improved Algorithms for Matching Biological Graphs}
alpar@0
   107
alpar@0
   108
%% use optional labels to link authors explicitly to addresses:
alpar@0
   109
%% \author[label1,label2]{}
alpar@0
   110
%% \address[label1]{}
alpar@0
   111
%% \address[label2]{}
alpar@0
   112
alpar@1
   113
\author{Alp{\'a}r J{\"u}ttner and P{\'e}ter Madarasi}
alpar@0
   114
alpar@1
   115
\address{Dept of Operations Research, ELTE}
alpar@0
   116
alpar@0
   117
\begin{abstract}
alpar@1
   118
Subgraph isomorphism is a well-known NP-Complete problem, while its
alpar@1
   119
special case, the graph isomorphism problem is one of the few problems
alpar@1
   120
in NP neither known to be in P nor NP-Complete. Their appearance in
alpar@1
   121
many fields of application such as pattern analysis, computer vision
alpar@1
   122
questions and the analysis of chemical and biological systems has
alpar@1
   123
fostered the design of various algorithms for handling special graph
alpar@1
   124
structures.
alpar@0
   125
alpar@1
   126
The idea of using state space representation and checking some
alpar@1
   127
conditions in each state to prune the search tree has made the VF2
alpar@1
   128
algorithm one of the state of the art graph matching algorithms for
alpar@1
   129
more than a decade. Recently, biological questions of ever increasing
alpar@1
   130
importance have required more efficient, specialized algorithms.
alpar@1
   131
alpar@1
   132
This paper presents VF2++, a new algorithm based on the original VF2,
alpar@1
   133
which runs significantly faster on most test cases and performs
alpar@1
   134
especially well on special graph classes stemming from biological
alpar@1
   135
questions. VF2++ handles graphs of thousands of nodes in practically
alpar@1
   136
near linear time including preprocessing. Not only is it an improved
alpar@1
   137
version of VF2, but in fact, it is by far the fastest existing
alpar@1
   138
algorithm regarding biological graphs.
alpar@1
   139
alpar@1
   140
The reason for VF2++' superiority over VF2 is twofold. Firstly, taking
alpar@1
   141
into account the structure and the node labeling of the graph, VF2++
alpar@1
   142
determines a state order in which most of the unfruitful branches of
alpar@1
   143
the search space can be pruned immediately. Secondly, introducing more
alpar@1
   144
efficient - nevertheless still easier to compute - cutting rules
alpar@1
   145
reduces the chance of going astray even further.
alpar@1
   146
alpar@1
   147
In addition to the usual subgraph isomorphism, specialized versions
alpar@1
   148
for induced subgraph isomorphism and for graph isomorphism are
alpar@1
   149
presented. VF2++ has gained a runtime improvement of one order of
alpar@1
   150
magnitude respecting induced subgraph isomorphism and a better
alpar@1
   151
asymptotical behaviour in the case of graph isomorphism problem.
alpar@1
   152
alpar@1
   153
After having provided the description of VF2++, in order to evaluate
alpar@1
   154
its effectiveness, an extensive comparison to the contemporary other
alpar@1
   155
algorithms is shown, using a wide range of inputs, including both real
alpar@1
   156
life biological and chemical datasets and standard randomly generated
alpar@1
   157
graph series.
alpar@1
   158
alpar@1
   159
The work was motivated and sponsored by QuantumBio Inc., and all the
alpar@1
   160
developed algorithms are available as the part of the open source
alpar@1
   161
LEMON graph and network optimization library
alpar@1
   162
(http://lemon.cs.elte.hu).
alpar@0
   163
\end{abstract}
alpar@0
   164
alpar@0
   165
\begin{keyword}
alpar@0
   166
%% keywords here, in the form: keyword \sep keyword
alpar@0
   167
alpar@0
   168
%% PACS codes here, in the form: \PACS code \sep code
alpar@0
   169
alpar@0
   170
%% MSC codes here, in the form: \MSC code \sep code
alpar@0
   171
%% or \MSC[2008] code \sep code (2000 is the default)
alpar@0
   172
alpar@0
   173
\end{keyword}
alpar@0
   174
alpar@0
   175
\end{frontmatter}
alpar@0
   176
alpar@0
   177
%% \linenumbers
alpar@0
   178
alpar@0
   179
%% main text
alpar@2
   180
\section{Introduction}
alpar@2
   181
\label{sec:intro}
alpar@2
   182
alpar@3
   183
In the last decades, combinatorial structures, and especially graphs
alpar@3
   184
have been considered with ever increasing interest, and applied to the
alpar@3
   185
solution of several new and revised questions.  The expressiveness,
alpar@3
   186
the simplicity and the studiedness of graphs make them practical for
alpar@3
   187
modelling and appear constantly in several seemingly independent
alpar@3
   188
fields.  Bioinformatics and chemistry are amongst the most relevant
alpar@3
   189
and most important fields.
alpar@2
   190
alpar@3
   191
Complex biological systems arise from the interaction and cooperation
alpar@3
   192
of plenty of molecular components. Getting acquainted with such
alpar@3
   193
systems at the molecular level has primary importance, since
alpar@3
   194
protein-protein interaction, DNA-protein interaction, metabolic
alpar@3
   195
interaction, transcription factor binding, neuronal networks, and
alpar@3
   196
hormone signaling networks can be understood only this way.
alpar@2
   197
alpar@3
   198
For instance, a molecular structure can be considered as a graph,
alpar@3
   199
whose nodes correspond to atoms and whose edges to chemical bonds. The
alpar@3
   200
secondary structure of a protein can also be represented as a graph,
alpar@3
   201
where nodes are associated with aminoacids and the edges with hydrogen
alpar@3
   202
bonds. The nodes are often whole molecular components and the edges
alpar@3
   203
represent some relationships among them.  The similarity and
alpar@3
   204
dissimilarity of objects corresponding to nodes are incorporated to
alpar@3
   205
the model by \emph{node labels}.  Many other chemical and biological
alpar@3
   206
structures can easily be modeled in a similar way. Understanding such
alpar@3
   207
networks basically requires finding specific subgraphs, which can not
alpar@3
   208
avoid the application of graph matching algorithms.
alpar@2
   209
alpar@3
   210
Finally, let some of the other real-world fields related to some
alpar@3
   211
variants of graph matching be briefly mentioned: pattern recognition
alpar@3
   212
and machine vision \cite{HorstBunkeApplications}, symbol recognition
alpar@3
   213
\cite{CordellaVentoSymbolRecognition}, face identification
alpar@3
   214
\cite{JianzhuangYongFaceIdentification}.  \\
alpar@2
   215
alpar@3
   216
Subgraph and induced subgraph matching problems are known to be
alpar@3
   217
NP-Complete\cite{SubgraphNPC}, while the graph isomorphism problem is
alpar@3
   218
one of the few problems in NP neither known to be in P nor
alpar@3
   219
NP-Complete. Although polynomial time isomorphism algorithms are known
alpar@3
   220
for various graph classes, like trees and planar
alpar@3
   221
graphs\cite{PlanarGraphIso}, bounded valence
alpar@3
   222
graphs\cite{BondedDegGraphIso}, interval graphs\cite{IntervalGraphIso}
alpar@3
   223
or permutation graphs\cite{PermGraphIso}.
alpar@2
   224
alpar@3
   225
In the following, some algorithms based on other approaches are
alpar@3
   226
summarized, which do not need any restrictions on the graphs. However,
alpar@3
   227
an overall polynomial behaviour is not expectable from such an
alpar@3
   228
alternative, it may often have good performance, even on a graph class
alpar@3
   229
for which polynomial algorithm is known. Note that this summary
alpar@3
   230
containing only exact matching algorithms is far not complete, neither
alpar@3
   231
does it cover all the recent algorithms.
alpar@2
   232
alpar@3
   233
The first practically usable approach was due to
alpar@4
   234
Ullmann\cite{Ullmann} which is a commonly used depth-first
alpar@3
   235
search based algorithm with a complex heuristic for reducing the
alpar@3
   236
number of visited states. A major problem is its $\Theta(n^3)$ space
alpar@3
   237
complexity, which makes it impractical in the case of big sparse
alpar@3
   238
graphs.
alpar@2
   239
alpar@4
   240
In a recent paper, Ullmann\cite{UllmannBit} presents an
alpar@3
   241
improved version of this algorithm based on a bit-vector solution for
alpar@3
   242
the binary Constraint Satisfaction Problem.
alpar@2
   243
alpar@4
   244
The Nauty algorithm\cite{Nauty} transforms the two graphs to
alpar@3
   245
a canonical form before starting to check for the isomorphism. It has
alpar@3
   246
been considered as one of the fastest graph isomorphism algorithms,
alpar@3
   247
although graph categories were shown in which it takes exponentially
alpar@3
   248
many steps. This algorithm handles only the graph isomorphism problem.
alpar@2
   249
alpar@4
   250
The \emph{LAD} algorithm\cite{Lad} uses a depth-first search
alpar@3
   251
strategy and formulates the matching as a Constraint Satisfaction
alpar@3
   252
Problem to prune the search tree. The constraints are that the mapping
alpar@3
   253
has to be injective and edge-preserving, hence it is possible to
alpar@3
   254
handle new matching types as well.
alpar@2
   255
alpar@3
   256
The \textbf{RI} algorithm\cite{RI} and its variations are based on a
alpar@3
   257
state space representation. After reordering the nodes of the graphs,
alpar@3
   258
it uses some fast executable heuristic checks without using any
alpar@3
   259
complex pruning rules. It seems to run really efficiently on graphs
alpar@3
   260
coming from biology, and won the International Contest on Pattern
alpar@3
   261
Search in Biological Databases\cite{Content}.
alpar@2
   262
alpar@3
   263
The currently most commonly used algorithm is the
alpar@3
   264
\textbf{VF2}\cite{VF2}, the improved version of VF\cite{VF}, which was
alpar@3
   265
designed for solving pattern matching and computer vision problems,
alpar@3
   266
and has been one of the best overall algorithms for more than a
alpar@3
   267
decade. Although, it can't be up to new specialized algorithms, it is
alpar@3
   268
still widely used due to its simplicity and space efficiency. VF2 uses
alpar@3
   269
a state space representation and checks some conditions in each state
alpar@3
   270
to prune the search tree.
alpar@2
   271
alpar@3
   272
Our first graph matching algorithm was the first version of VF2 which
alpar@3
   273
recognizes the significance of the node ordering, more opportunities
alpar@3
   274
to increase the cutting efficiency and reduce its computational
alpar@3
   275
complexity. This project was initiated and sponsored by QuantumBio
alpar@3
   276
Inc.\cite{QUANTUMBIO} and the implementation --- along with a source
alpar@3
   277
code --- has been published as a part of LEMON\cite{LEMON} open source
alpar@3
   278
graph library.
alpar@2
   279
alpar@3
   280
This paper introduces \textbf{VF2++}, a new further improved algorithm
alpar@3
   281
for the graph and (induced)subgraph isomorphism problem, which uses
alpar@3
   282
efficient cutting rules and determines a node order in which VF2 runs
alpar@3
   283
significantly faster on practical inputs.
alpar@2
   284
alpar@3
   285
Meanwhile, another variant called \textbf{VF2 Plus}\cite{VF2Plus} has
alpar@3
   286
been published. It is considered to be as efficient as the RI
alpar@3
   287
algorithm and has a strictly better behavior on large graphs.  The
alpar@3
   288
main idea of VF2 Plus is to precompute a heuristic node order of the
alpar@3
   289
small graph, in which the VF2 works more efficiently.
alpar@2
   290
alpar@2
   291
\section{Problem Statement}
alpar@3
   292
This section provides a detailed description of the problems to be
alpar@3
   293
solved.
alpar@2
   294
\subsection{Definitions}
alpar@2
   295
alpar@3
   296
Throughout the paper $G_{small}=(V_{small}, E_{small})$ and
alpar@3
   297
$G_{large}=(V_{large}, E_{large})$ denote two undirected graphs.
alpar@2
   298
\begin{definition}\label{sec:ismorphic}
alpar@3
   299
$G_{small}$ and $G_{large}$ are \textbf{isomorphic} if $\exists M:
alpar@3
   300
  V_{small} \longrightarrow V_{large}$ bijection, for which the
alpar@3
   301
  following is true:
alpar@2
   302
\begin{center}
alpar@3
   303
$\forall u,v\in{V_{small}} : (u,v)\in{E_{small}} \Leftrightarrow
alpar@3
   304
  (M(u),M(v))\in{E_{large}}$
alpar@2
   305
\end{center}
alpar@2
   306
\end{definition}
alpar@3
   307
For the sake of simplicity in this paper subgraphs and induced
alpar@3
   308
subgraphs are defined in a more general way than usual:
alpar@2
   309
\begin{definition}
alpar@3
   310
$G_{small}$ is a \textbf{subgraph} of $G_{large}$ if $\exists I:
alpar@3
   311
  V_{small}\longrightarrow V_{large}$ injection, for which the
alpar@3
   312
  following is true:
alpar@2
   313
\begin{center}
alpar@2
   314
$\forall u,v \in{V_{small}} : (u,v)\in{E_{small}} \Rightarrow (I(u),I(v))\in E_{large}$
alpar@2
   315
\end{center}
alpar@2
   316
\end{definition}
alpar@2
   317
alpar@2
   318
\begin{definition} 
alpar@3
   319
$G_{small}$ is an \textbf{induced subgraph} of $G_{large}$ if $\exists
alpar@3
   320
  I: V_{small}\longrightarrow V_{large}$ injection, for which the
alpar@3
   321
  following is true:
alpar@2
   322
\begin{center}
alpar@3
   323
$\forall u,v \in{V_{small}} : (u,v)\in{E_{small}} \Leftrightarrow
alpar@3
   324
  (I(u),I(v))\in E_{large}$
alpar@2
   325
\end{center}
alpar@2
   326
\end{definition}
alpar@2
   327
alpar@2
   328
\begin{definition}
alpar@3
   329
$lab: (V_{small}\cup V_{large}) \longrightarrow K$ is a \textbf{node
alpar@3
   330
    label function}, where K is an arbitrary set. The elements in K
alpar@3
   331
  are the \textbf{node labels}. Two nodes, u and v are said to be
alpar@3
   332
  \textbf{equivalent}, if $lab(u)=lab(v)$.
alpar@2
   333
\end{definition}
alpar@2
   334
alpar@3
   335
When node labels are also given, the matched nodes must have the same
alpar@3
   336
labels.  For example, the node labeled isomorphism is phrased by
alpar@2
   337
\begin{definition}
alpar@3
   338
$G_{small}$ and $G_{large}$ are \textbf{isomorphic by the node label
alpar@3
   339
    function lab} if $\exists M: V_{small} \longrightarrow V_{large}$
alpar@3
   340
  bijection, for which the following is true:
alpar@2
   341
\begin{center}
alpar@3
   342
$(\forall u,v\in{V_{small}} : (u,v)\in{E_{small}} \Leftrightarrow
alpar@3
   343
  (M(u),M(v))\in{E_{large}})$ and $(\forall u\in{V_{small}} :
alpar@3
   344
  lab(u)=lab(M(u)))$
alpar@2
   345
\end{center}
alpar@2
   346
\end{definition}
alpar@2
   347
alpar@2
   348
The other two definitions can be extended in the same way.
alpar@2
   349
alpar@3
   350
Note that edge label function can be defined similarly to node label
alpar@3
   351
function, and all the definitions can be extended with additional
alpar@3
   352
conditions, but it is out of the scope of this work.
alpar@2
   353
alpar@3
   354
The equivalence of two nodes is usually defined by another relation,
alpar@3
   355
$\\R\subseteq (V_{small}\cup V_{large})^2$. This overlaps with the
alpar@3
   356
definition given above if R is an equivalence relation, which does not
alpar@3
   357
mean restriction in biological and chemical applications.
alpar@2
   358
alpar@2
   359
\subsection{Common problems}\label{sec:CommProb}
alpar@2
   360
alpar@3
   361
The focus of this paper is on two extensively studied topics, the
alpar@3
   362
subgraph isomorphism and its variations. However, the following
alpar@3
   363
problems also appear in many applications.
alpar@2
   364
alpar@3
   365
The \textbf{subgraph matching problem} is the following: is
alpar@3
   366
$G_{small}$ isomorphic to any subgraph of $G_{large}$ by a given node
alpar@3
   367
label?
alpar@2
   368
alpar@3
   369
The \textbf{induced subgraph matching problem} asks the same about the
alpar@3
   370
existence of an induced subgraph.
alpar@2
   371
alpar@3
   372
The \textbf{graph isomorphism problem} can be defined as induced
alpar@3
   373
subgraph matching problem where the sizes of the two graphs are equal.
alpar@2
   374
alpar@3
   375
In addition to existence, it may be needed to show such a subgraph, or
alpar@3
   376
it may be necessary to list all of them.
alpar@2
   377
alpar@3
   378
It should be noted that some authors misleadingly refer to the term
alpar@3
   379
\emph{subgraph isomorphism problem} as an \emph{induced subgraph
alpar@3
   380
  isomorphism problem}.
alpar@2
   381
alpar@3
   382
The following sections give the descriptions of VF2, VF2++, VF2 Plus
alpar@3
   383
and a particular comparison.
alpar@2
   384
alpar@2
   385
\section{The VF2 Algorithm}
alpar@3
   386
This algorithm is the basis of both the VF2++ and the VF2 Plus.  VF2
alpar@4
   387
is able to handle all the variations mentioned in Section
alpar@4
   388
  \ref{sec:CommProb}.  Although it can also handle directed graphs,
alpar@3
   389
for the sake of simplicity, only the undirected case will be
alpar@3
   390
discussed.
alpar@2
   391
alpar@2
   392
alpar@2
   393
\subsection{Common notations}
alpar@3
   394
\indent Assume $G_{small}$ is searched in $G_{large}$.  The following
alpar@3
   395
definitions and notations will be used throughout the whole paper.
alpar@2
   396
\begin{definition}
alpar@3
   397
A set $M\subseteq V_{small}\times V_{large}$ is called
alpar@3
   398
\textbf{mapping}, if no node of $V_{small}$ or of $V_{large}$ appears
alpar@3
   399
in more than one pair in M.  That is, M uniquely associates some of
alpar@3
   400
the nodes in $V_{small}$ with some nodes of $V_{large}$ and vice
alpar@3
   401
versa.
alpar@2
   402
\end{definition}
alpar@2
   403
alpar@2
   404
\begin{definition}
alpar@3
   405
Mapping M \textbf{covers} a node v, if there exists a pair in M, which
alpar@3
   406
contains v.
alpar@2
   407
\end{definition}
alpar@2
   408
alpar@2
   409
\begin{definition}
alpar@3
   410
A mapping $M$ is $\mathbf{whole\ mapping}$, if $M$ covers all the
alpar@3
   411
nodes in $V_{small}$.
alpar@2
   412
\end{definition}
alpar@2
   413
alpar@2
   414
\begin{notation}
alpar@3
   415
Let $\mathbf{M_{small}(s)} := \{u\in V_{small} : \exists v\in
alpar@3
   416
V_{large}: (u,v)\in M(s)\}$ and $\mathbf{M_{large}(s)} := \{v\in
alpar@3
   417
V_{large} : \exists u\in V_{small}: (u,v)\in M(s)\}$.
alpar@2
   418
\end{notation}
alpar@2
   419
alpar@2
   420
\begin{notation}
alpar@3
   421
Let $\mathbf{Pair(M,v)}$ be the pair of $v$ in $M$, if such a node
alpar@3
   422
exist, otherwise $\mathbf{Pair(M,v)}$ is undefined. For a mapping $M$
alpar@3
   423
and $v\in V_{small}\cup V_{large}$.
alpar@2
   424
\end{notation}
alpar@2
   425
alpar@2
   426
Note that if $\mathbf{Pair(M,v)}$ exists, then it is unique
alpar@2
   427
alpar@3
   428
The definitions of the isomorphism types can be rephrased on the
alpar@3
   429
existence of a special whole mapping $M$, since it represents a
alpar@3
   430
bijection. For example
alpar@2
   431
\begin{center}
alpar@3
   432
$M\subseteq V_{small}\times V_{large}$ represents an induced subgraph
alpar@3
   433
  isomorphism $\Leftrightarrow$ $M$ is whole mapping and $\forall u,v
alpar@3
   434
  \in{V_{small}} : (u,v)\in{E_{small}} \Leftrightarrow
alpar@3
   435
  (Pair(M,u),Pair(M,v))\in E_{large}$.
alpar@2
   436
\end{center}
alpar@2
   437
alpar@2
   438
\begin{definition}
alpar@2
   439
A set of whole mappings is called \textbf{problem type}.
alpar@2
   440
\end{definition}
alpar@3
   441
Throughout the paper, $\mathbf{PT}$ denotes a generic problem type
alpar@3
   442
which can be substituted by any problem type.
alpar@2
   443
alpar@3
   444
A whole mapping $W\mathbf{\ is\ of\ type\ PT}$, if $W\in PT$. Using
alpar@3
   445
this notations, VF2 searches a whole mapping $W$ of type $PT$.
alpar@2
   446
alpar@3
   447
For example the problem type of graph isomorphism problem is the
alpar@3
   448
following.  A whole mapping $W$ is in $\mathbf{ISO}$, iff the
alpar@4
   449
bijection represented by $W$ satisfies Definition~\ref{sec:ismorphic}.
alpar@4
   450
The subgraph- and induced subgraph matching problems can be formalized
alpar@4
   451
in a similar way. Let their problem types be denoted as $\mathbf{SUB}$
alpar@4
   452
and $\mathbf{IND}$.
alpar@2
   453
alpar@2
   454
\begin{definition}
alpar@2
   455
\label{expPT}
alpar@3
   456
$PT$ is an \textbf{expanding problem type} if $\ \forall\ W\in
alpar@3
   457
PT:\ \forall u_1,u_2\in V_{small}:\ (u_1,u_2)\in E_{small}\Rightarrow
alpar@3
   458
(Pair(W,u_1),Pair(W,u_2))\in E_{large}$, that is each edge of
alpar@3
   459
$G_{small}$ has to be mapped to an edge of $G_{large}$ for each
alpar@3
   460
mapping in $PT$.
alpar@2
   461
\end{definition}
alpar@2
   462
alpar@2
   463
Note that $ISO$, $SUB$ and $IND$ are expanding problem types.
alpar@2
   464
alpar@3
   465
This paper deals with the three problem types mentioned above only,
alpar@3
   466
but the following generic definitions make it possible to handle other
alpar@3
   467
types as well.  Although it may be challenging to find a proper
alpar@3
   468
consistency function and an efficient cutting function.
alpar@2
   469
alpar@2
   470
\begin{definition}
alpar@3
   471
Let M be a mapping. A logical function $\mathbf{Cons_{PT}}$ is a
alpar@3
   472
\textbf{consistency function by } $\mathbf{PT}$, if the following
alpar@3
   473
holds. If there exists whole mapping $W$ of type PT for which
alpar@3
   474
$M\subseteq W$, then $Cons_{PT}(M)$ is true.
alpar@2
   475
\end{definition}
alpar@2
   476
alpar@2
   477
\begin{definition} 
alpar@3
   478
Let M be a mapping. A logical function $\mathbf{Cut_{PT}}$ is a
alpar@3
   479
\textbf{cutting function by } $\mathbf{PT}$, if the following
alpar@3
   480
holds. $\mathbf{Cut_{PT}(M)}$ is false if $M$ can be extended to a
alpar@3
   481
whole mapping W of type PT.
alpar@2
   482
\end{definition}
alpar@2
   483
alpar@2
   484
\begin{definition}
alpar@3
   485
$M$ is said to be \textbf{consistent mapping by} $\mathbf{PT}$, if
alpar@3
   486
  $Cons_{PT}(M)$ is true.
alpar@2
   487
\end{definition}
alpar@2
   488
alpar@2
   489
$Cons_{PT}$ and $Cut_{PT}$ will often be used in the following form.
alpar@2
   490
\begin{notation}
alpar@3
   491
Let $\mathbf{Cons_{PT}(p, M)}:=Cons_{PT}(M\cup\{p\})$ and
alpar@3
   492
$\mathbf{Cut_{PT}(p, M)}:=Cut_{PT}(M\cup\{p\})$, where
alpar@3
   493
$p\in{V_{small}\!\times\!V_{large}}$ and $M\cup\{p\}$ is mapping.
alpar@2
   494
\end{notation}
alpar@2
   495
alpar@3
   496
$Cons_{PT}$ will be used to check the consistency of the already
alpar@3
   497
covered nodes, while $Cut_{PT}$ is for looking ahead to recognize if
alpar@3
   498
no whole consistent mapping can contain the current mapping.
alpar@2
   499
alpar@2
   500
\subsection{Overview of the algorithm}
alpar@3
   501
VF2 uses a state space representation of mappings, $Cons_{PT}$ for
alpar@3
   502
excluding inconsistency with the problem type and $Cut_{PT}$ for
alpar@3
   503
pruning the search tree.  Each state $s$ of the matching process can
alpar@3
   504
be associated with a mapping $M(s)$.
alpar@2
   505
alpar@4
   506
Algorithm~\ref{alg:VF2Pseu} is a high level description of
alpar@3
   507
the VF2 matching algorithm.
alpar@2
   508
alpar@2
   509
alpar@2
   510
\begin{algorithm}
alpar@3
   511
\algtext*{EndIf}%ne nyomtasson end if-et \algtext*{EndFor}%ne
alpar@3
   512
nyomtasson ..  \algtext*{EndProcedure}%ne nyomtasson ..
alpar@2
   513
\caption{\hspace{0.5cm}$A\ high\ level\ description\ of\ VF2$}\label{alg:VF2Pseu}
alpar@2
   514
\begin{algorithmic}[1]
alpar@2
   515
alpar@3
   516
\Procedure{VF2}{State $s$, ProblemType $PT$} \If{$M(s$) covers
alpar@3
   517
  $V_{small}$} \State Output($M(s)$) \Else
alpar@2
   518
  
alpar@3
   519
  \State Compute the set $P(s)$ of the pairs candidate for inclusion
alpar@3
   520
  in $M(s)$ \ForAll{$p\in{P(s)}$} \If{Cons$_{PT}$($p, M(s)$) $\wedge$
alpar@3
   521
    $\neg$Cut$_{PT}$($p, M(s)$)} \State Compute the nascent state
alpar@3
   522
  $\tilde{s}$ by adding $p$ to $M(s)$ \State \textbf{call}
alpar@3
   523
  VF2($\tilde{s}$, $PT$) \EndIf \EndFor \EndIf \EndProcedure
alpar@2
   524
\end{algorithmic}
alpar@2
   525
\end{algorithm}
alpar@2
   526
alpar@2
   527
alpar@3
   528
The initial state $s_0$ is associated with $M(s_0)=\emptyset$, i.e. it
alpar@3
   529
starts with an empty mapping.
alpar@2
   530
alpar@3
   531
For each state $s$, the algorithm computes $P(s)$, the set of
alpar@3
   532
candidate node pairs for adding to the current state $s$.
alpar@2
   533
alpar@3
   534
For each pair $p$ in $P(s)$, $Cons_{PT}(p,M(s))$ and
alpar@3
   535
$Cut_{PT}(p,M(s))$ are evaluated. If $Cons_{PT}(p,M(s))$ is true and
alpar@3
   536
$Cut_{PT}(p,M(s))$ is false, the successor state $\tilde{s}=s\cup
alpar@3
   537
\{p\}$ is computed, and the whole process is recursively applied to
alpar@3
   538
$\tilde{s}$. Otherwise, $\tilde{s}$ is not consistent by $PT$ or it
alpar@3
   539
can be proved that $s$ can not be extended to a whole mapping.
alpar@2
   540
alpar@2
   541
In order to make sure of the correctness see
alpar@2
   542
\begin{claim}
alpar@3
   543
Through consistent mappings, only consistent whole mappings can be
alpar@3
   544
reached, and all of the whole mappings are reachable through
alpar@3
   545
consistent mappings.
alpar@2
   546
\end{claim}
alpar@2
   547
alpar@3
   548
Note that a state may be reached in many different ways, since the
alpar@3
   549
order of insertions into M does not influence the nascent mapping. In
alpar@3
   550
fact, the number of different ways which lead to the same state can be
alpar@3
   551
exponentially large. If $G_{small}$ and $G_{large}$ are circles with n
alpar@3
   552
nodes and n different node labels, there exists exactly one graph
alpar@3
   553
isomorphism between them, but it will be reached in $n!$ different
alpar@3
   554
ways.
alpar@2
   555
alpar@2
   556
However, one may observe
alpar@2
   557
alpar@2
   558
\begin{claim}
alpar@2
   559
\label{claim:claimTotOrd}
alpar@3
   560
Let $\prec$ an arbitrary total ordering relation on $V_{small}$.  If
alpar@3
   561
the algorithm ignores each $p=(u,v) \in P(s)$, for which
alpar@2
   562
\begin{center}
alpar@2
   563
$\exists (\hat{u},\hat{v})\in P(s): \hat{u} \prec u$,
alpar@2
   564
\end{center}
alpar@3
   565
then no state can be reached more than ones and each state associated
alpar@3
   566
with a whole mapping remains reachable.
alpar@2
   567
\end{claim}
alpar@2
   568
alpar@3
   569
Note that the cornerstone of the improvements to VF2 is a proper
alpar@3
   570
choice of a total ordering.
alpar@2
   571
alpar@2
   572
\subsection{The candidate set P(s)}
alpar@2
   573
\label{candidateComputingVF2}
alpar@2
   574
$P(s)$ is the set of the candidate pairs for inclusion in $M(s)$.
alpar@3
   575
Suppose that $PT$ is an expanding problem type, see
alpar@4
   576
Definition~\ref{expPT}.
alpar@2
   577
alpar@2
   578
\begin{notation}
alpar@3
   579
Let $\mathbf{T_{small}(s)}:=\{u \in V_{small} : u$ is not covered by
alpar@3
   580
$M(s)\wedge\exists \tilde{u}\in{V_{small}: (u,\tilde{u})\in E_{small}}
alpar@3
   581
\wedge \tilde{u}$ is covered by $M(s)\}$, and
alpar@3
   582
\\ $\mathbf{T_{large}(s)}\!:=\!\{v \in\!V_{large}\!:\!v$ is not
alpar@3
   583
covered by
alpar@3
   584
$M(s)\wedge\!\exists\tilde{v}\!\in\!{V_{large}\!:\!(v,\tilde{v})\in\!E_{large}}
alpar@3
   585
\wedge \tilde{v}$ is covered by $M(s)\}$
alpar@2
   586
\end{notation}
alpar@2
   587
alpar@3
   588
The set $P(s)$ includes the pairs of uncovered neighbours of covered
alpar@3
   589
nodes and if there is not such a node pair, all the pairs containing
alpar@3
   590
two uncovered nodes are added. Formally, let
alpar@2
   591
\[
alpar@2
   592
 P(s)\!=\!
alpar@2
   593
  \begin{cases} 
alpar@3
   594
   T_{small}(s)\times T_{large}(s)&\hspace{-0.15cm}\text{if }
alpar@3
   595
   T_{small}(s)\!\neq\!\emptyset\!\wedge\!T_{large}(s)\!\neq
alpar@3
   596
   \emptyset,\\ (V_{small}\!\setminus\!M_{small}(s))\!\times\!(V_{large}\!\setminus\!M_{large}(s))
alpar@3
   597
   &\hspace{-0.15cm}otherwise.
alpar@2
   598
  \end{cases}
alpar@2
   599
\]
alpar@2
   600
alpar@2
   601
\subsection{Consistency}
alpar@3
   602
This section defines the consistency functions for the different
alpar@4
   603
problem types mentioned in Section~\ref{sec:CommProb}.
alpar@2
   604
\begin{notation}
alpar@3
   605
Let $\mathbf{\Gamma_{small} (u)}:=\{\tilde{u}\in V_{small} :
alpar@3
   606
(u,\tilde{u})\in E_{small}\}$\\ Let $\mathbf{\Gamma_{large}
alpar@3
   607
  (v)}:=\{\tilde{v}\in V_{large} : (v,\tilde{v})\in E_{large}\}$
alpar@2
   608
\end{notation}
alpar@3
   609
Suppose $p=(u,v)$, where $u\in V_{small}$ and $v\in V_{large}$, $s$ is
alpar@3
   610
a state of the matching procedure, $M(s)$ is consistent mapping by
alpar@3
   611
$PT$ and $lab(u)=lab(v)$.  $Cons_{PT}(p,M(s))$ checks whether
alpar@3
   612
including pair $p$ into $M(s)$ leads to a consistent mapping by $PT$.
alpar@2
   613
alpar@2
   614
\subsubsection{Induced subgraph isomorphism}
alpar@3
   615
$M(s)\cup \{(u,v)\}$ is a consistent mapping by $IND$ $\Leftrightarrow
alpar@3
   616
(\forall \tilde{u}\in M_{small}: (u,\tilde{u})\in E_{small}
alpar@3
   617
\Leftrightarrow (v,Pair(M(s),\tilde{u}))\in E_{large})$.\newline The
alpar@3
   618
following formulation gives an efficient way of calculating
alpar@3
   619
$Cons_{IND}$.
alpar@2
   620
\begin{claim}
alpar@3
   621
$Cons_{IND}((u,v),M(s)):=(\forall \tilde{v}\in \Gamma_{large}(v)
alpar@3
   622
  \ \cap\ M_{large}(s):\\(Pair(M(s),\tilde{v}),u)\in E_{small})\wedge
alpar@3
   623
  (\forall \tilde{u}\in \Gamma_{small}(u)
alpar@3
   624
  \ \cap\ M_{small}(s):(v,Pair(M(s),\tilde{u}))\in E_{large})$ is a
alpar@3
   625
  consistency function in the case of $IND$.
alpar@2
   626
\end{claim}
alpar@2
   627
alpar@2
   628
\subsubsection{Graph isomorphism}
alpar@3
   629
$M(s)\cup \{(u,v)\}$ is a consistent mapping by $ISO$
alpar@3
   630
$\Leftrightarrow$ $M(s)\cup \{(u,v)\}$ is a consistent mapping by
alpar@3
   631
$IND$.
alpar@2
   632
\begin{claim}
alpar@3
   633
$Cons_{ISO}((u,v),M(s))$ is a consistency function by $ISO$ if and
alpar@3
   634
  only if it is a consistency function by $IND$.
alpar@2
   635
\end{claim}
alpar@2
   636
\subsubsection{Subgraph isomorphism}
alpar@3
   637
$M(s)\cup \{(u,v)\}$ is a consistent mapping by $SUB$ $\Leftrightarrow
alpar@3
   638
(\forall \tilde{u}\in M_{small}:\\(u,\tilde{u})\in E_{small}
alpar@3
   639
\Rightarrow (v,Pair(M(s),\tilde{u}))\in E_{large})$.
alpar@2
   640
\newline
alpar@3
   641
The following formulation gives an efficient way of calculating
alpar@3
   642
$Cons_{SUB}$.
alpar@2
   643
\begin{claim}
alpar@3
   644
$Cons_{SUB}((u,v),M(s)):= (\forall \tilde{u}\in \Gamma_{small}(u)
alpar@3
   645
  \ \cap\ M_{small}(s):\\(v,Pair(M(s),\tilde{u}))\in E_{large})$ is a
alpar@3
   646
  consistency function by $SUB$.
alpar@2
   647
\end{claim}
alpar@2
   648
alpar@2
   649
\subsection{Cutting rules}
alpar@3
   650
$Cut_{PT}(p,M(s))$ is defined by a collection of efficiently
alpar@3
   651
verifiable conditions. The requirement is that $Cut_{PT}(p,M(s))$ can
alpar@3
   652
be true only if it is impossible to extended $M(s)\cup \{p\}$ to a
alpar@3
   653
whole mapping.
alpar@2
   654
\begin{notation}
alpar@2
   655
alpar@3
   656
Let $\mathbf{\tilde{T}_{small}}(s):=(V_{small}\backslash
alpar@3
   657
M_{small}(s))\backslash T_{small}(s)$, and
alpar@3
   658
\\ $\mathbf{\tilde{T}_{large}}(s):=(V_{large}\backslash
alpar@3
   659
M_{large}(s))\backslash T_{large}(s)$.
alpar@2
   660
\end{notation}
alpar@2
   661
\subsubsection{Induced subgraph isomorphism}
alpar@2
   662
\begin{claim}
alpar@3
   663
$Cut_{IND}((u,v),M(s)):= |\Gamma_{large} (v)\ \cap\ T_{large}(s)| <
alpar@3
   664
  |\Gamma_{small} (u)\ \cap\ T_{small}(s)| \vee |\Gamma_{large}(v)\cap
alpar@3
   665
  \tilde{T}_{large}(s)| < |\Gamma_{small}(u)\cap
alpar@3
   666
  \tilde{T}_{small}(s)|$ is a cutting function by $IND$.
alpar@2
   667
\end{claim}
alpar@2
   668
\subsubsection{Graph isomorphism}
alpar@3
   669
Note that the cutting function of induced subgraph isomorphism defined
alpar@3
   670
above is a cutting function by $ISO$, too, however it is less
alpar@3
   671
efficient than the following while their computational complexity is
alpar@3
   672
the same.
alpar@2
   673
\begin{claim}
alpar@3
   674
$Cut_{ISO}((u,v),M(s)):= |\Gamma_{large} (v)\ \cap\ T_{large}(s)| \neq
alpar@3
   675
  |\Gamma_{small} (u)\ \cap\ T_{small}(s)| \vee |\Gamma_{large}(v)\cap
alpar@3
   676
  \tilde{T}_{large}(s)| \neq |\Gamma_{small}(u)\cap
alpar@3
   677
  \tilde{T}_{small}(s)|$ is a cutting function by $ISO$.
alpar@2
   678
\end{claim}
alpar@2
   679
alpar@2
   680
\subsubsection{Subgraph isomorphism}
alpar@2
   681
\begin{claim}
alpar@3
   682
$Cut_{SUB}((u,v),M(s)):= |\Gamma_{large} (v)\ \cap\ T_{large}(s)| <
alpar@3
   683
  |\Gamma_{small} (u)\ \cap\ T_{small}(s)|$ is a cutting function by
alpar@3
   684
  $SUB$.
alpar@2
   685
\end{claim}
alpar@3
   686
Note that there is a significant difference between induced and
alpar@3
   687
non-induced subgraph isomorphism:
alpar@2
   688
alpar@2
   689
\begin{claim}
alpar@2
   690
\label{claimSUB}
alpar@3
   691
$Cut_{SUB}'((u,v),M(s)):= |\Gamma_{large} (v)\ \cap\ T_{large}(s)| <
alpar@3
   692
|\Gamma_{small} (u)\ \cap\ T_{small}(s)| \vee |\Gamma_{large}(v)\cap
alpar@3
   693
\tilde{T}_{large}(s)| < |\Gamma_{small}(u)\cap \tilde{T}_{small}(s)|$
alpar@3
   694
is \textbf{not} a cutting function by $SUB$.
alpar@2
   695
\end{claim}
alpar@2
   696
\begin{proof}$ $\\
alpar@2
   697
\vspace*{-0.5cm}
alpar@2
   698
alpar@2
   699
\begin{figure}
alpar@2
   700
\begin{center}
alpar@2
   701
\begin{tikzpicture}
alpar@2
   702
  [scale=.8,auto=left,every node/.style={circle,fill=black!15}]
alpar@3
   703
  \node[rectangle,fill=black!15] at (4,6) {$G_{small}$}; \node (u4) at
alpar@3
   704
  (2.5,10) {$u_4$}; \node (u3) at (5.5,10) {$u_3$}; \node (u1) at
alpar@3
   705
  (2.5,7) {$u_1$}; \node (u2) at (5.5,7) {$u_2$};
alpar@2
   706
  
alpar@2
   707
  \node[rectangle,fill=black!30] at (13.5,6) {$G_{large}$};
alpar@3
   708
  \node[fill=black!30] (v4) at (12,10) {$v_4$}; \node[fill=black!30]
alpar@3
   709
  (v3) at (15,10) {$v_3$}; \node[fill=black!30] (v1) at (12,7)
alpar@3
   710
       {$v_1$}; \node[fill=black!30] (v2) at (15,7) {$v_2$};
alpar@2
   711
  
alpar@2
   712
alpar@3
   713
  \foreach \from/\to in {u1/u2,u2/u3,u3/u4,u4/u1} \draw (\from) --
alpar@3
   714
  (\to); \foreach \from/\to in {v1/v2,v2/v3,v3/v4,v4/v1,v1/v3} \draw
alpar@3
   715
  (\from) -- (\to);
alpar@2
   716
%    \draw[dashed] (\from) -- (\to);
alpar@2
   717
\end{tikzpicture}
alpar@4
   718
\caption{Graphs for the proof of Claim~\ref{claimSUB}}
alpar@4
   719
\label{fig:proofSUB}
alpar@2
   720
\end{center}
alpar@2
   721
\end{figure}
alpar@4
   722
Let the two graphs of Figure~\ref{fig:proofSUB} be the input
alpar@3
   723
graphs.  Suppose the total ordering relation is $u_1 \prec u_2 \prec
alpar@3
   724
u_3 \prec u_4$,$M(s)\!=\{(u_1,v_1)\}$, and VF2 tries to add
alpar@3
   725
$(u_2,v_2)\in P(s)$.\newline $Cons_{SUB}((u_2,v_2),M(s))=true$, so
alpar@3
   726
$M\cup \{(u_2,v_2)\}$ is consistent by $SUB$. The cutting function
alpar@3
   727
$Cut_{SUB}((u_2,v_2),M(s))$ is false, so it does not let cut the
alpar@3
   728
tree.\newline On the other hand $Cut_{SUB}'((u_2,v_2),M(s))$ is true,
alpar@3
   729
since\\$0=|\Gamma_{large}(v_2)\cap
alpar@3
   730
\tilde{T}_{large}(s)|<|\Gamma_{small}(u_2)\cap
alpar@3
   731
\tilde{T}_{small}(s)|=1$ is true, but still the tree can not be
alpar@3
   732
pruned, because otherwise the
alpar@3
   733
$\{(u_1,v_1)(u_2,v_2)(u_3,v_3)(u_4,v_4)\}$ mapping can not be found.
alpar@2
   734
\end{proof}
alpar@2
   735
alpar@2
   736
\section{The VF2++ Algorithm}
alpar@3
   737
Although any total ordering relation makes the search space of VF2 a
alpar@3
   738
tree, its choice turns out to dramatically influence the number of
alpar@3
   739
visited states. The goal is to determine an efficient one as quickly
alpar@3
   740
as possible.
alpar@2
   741
alpar@3
   742
The main reason for VF2++' superiority over VF2 is twofold. Firstly,
alpar@3
   743
taking into account the structure and the node labeling of the graph,
alpar@3
   744
VF2++ determines a state order in which most of the unfruitful
alpar@3
   745
branches of the search space can be pruned immediately. Secondly,
alpar@3
   746
introducing more efficient --- nevertheless still easier to compute
alpar@3
   747
--- cutting rules reduces the chance of going astray even further.
alpar@2
   748
alpar@3
   749
In addition to the usual subgraph isomorphism, specialized versions
alpar@3
   750
for induced subgraph isomorphism and for graph isomorphism have been
alpar@3
   751
designed. VF2++ has gained a runtime improvement of one order of
alpar@3
   752
magnitude respecting induced subgraph isomorphism and a better
alpar@3
   753
asymptotical behaviour in the case of graph isomorphism problem.
alpar@2
   754
alpar@3
   755
Note that a weaker version of the cutting rules and the more efficient
alpar@3
   756
candidate set calculating were described in \cite{VF2Plus}, too.
alpar@2
   757
alpar@3
   758
It should be noted that all the methods described in this section are
alpar@3
   759
extendable to handle directed graphs and edge labels as well.
alpar@2
   760
alpar@3
   761
The basic ideas and the detailed description of VF2++ are provided in
alpar@3
   762
the following.
alpar@2
   763
alpar@2
   764
\subsection{Preparations}
alpar@2
   765
\begin{claim}
alpar@2
   766
\label{claim:claimCoverFromLeft}
alpar@3
   767
The total ordering relation uniquely determines a node order, in which
alpar@3
   768
the nodes of $V_{small}$ will be covered by VF2. From the point of
alpar@3
   769
view of the matching procedure, this means, that always the same node
alpar@3
   770
of $G_{small}$ will be covered on the d-th level.
alpar@2
   771
\end{claim}
alpar@2
   772
\begin{proof}
alpar@3
   773
In order to make the search space a tree, the pairs in $\{(u,v)\in
alpar@3
   774
P(s) : \exists \hat{u} : \hat{u}\prec u\}$ are excluded from $P(s)$.
alpar@2
   775
\newline
alpar@3
   776
Let $\tilde{P}(s):=P(s)\backslash \{(u,v)\in P(s) : \exists \hat{u} :
alpar@3
   777
\hat{u}\prec u\}$
alpar@2
   778
\newline
alpar@3
   779
The relation $\prec$ is a total ordering, so $\exists!\ \tilde{u} :
alpar@3
   780
\forall\ (u,v)\in \tilde{P}(s): u=\tilde u$. Since a pair form
alpar@3
   781
$\tilde{P}(s)$ is chosen for including into $M(s)$, it is obvious,
alpar@3
   782
that only $\tilde{u}$ can be covered in $V_{small}$. Actually,
alpar@3
   783
$\tilde{u}$ is the smallest element in $T_{small}(s)$ (or in
alpar@3
   784
$V_{small}\backslash M_{small}(s)$, if $T_{small}(s)$ were empty), and
alpar@3
   785
$T_{small}(s)$ depends only on the covered nodes of $G_{small}$.
alpar@2
   786
\newline
alpar@3
   787
Simple induction on $d$ shows that the set of covered nodes of
alpar@3
   788
$G_{small}$ is unique, if $d$ is given, so $\tilde{u}$ is unique if
alpar@3
   789
$d$ is given.
alpar@2
   790
\end{proof}
alpar@2
   791
alpar@2
   792
\begin{definition}
alpar@3
   793
An order $(u_{\sigma(1)},u_{\sigma(2)},..,u_{\sigma(|V_{small}|)})$ of
alpar@3
   794
$V_{small}$ is \textbf{matching order}, if exists $\prec$ total
alpar@3
   795
ordering relation, s.t. the VF2 with $\prec$ on the d-th level finds
alpar@3
   796
pair for $u_{\sigma(d)}$ for all $d\in\{1,..,|V_{small}|\}$.
alpar@2
   797
\end{definition}
alpar@2
   798
alpar@2
   799
\begin{claim}\label{claim:MOclaim}
alpar@3
   800
A total ordering is matching order, iff the nodes of every component
alpar@3
   801
form an interval in the node sequence, and every node connects to a
alpar@3
   802
previous node in its component except the first node of the
alpar@3
   803
component. The order of the components is arbitrary.  \\Formally
alpar@3
   804
spoken, an order
alpar@3
   805
$(u_{\sigma(1)},u_{\sigma(2)},..,u_{\sigma(|V_{small}|)})$ of
alpar@3
   806
$V_{small}$ is matching order $\Leftrightarrow$ $\forall
alpar@3
   807
G'_{small}=(V'_{small},E'_{small})\ component\ of\ G_{small}: \forall
alpar@3
   808
i: (\exists j : j<i\wedge u_{\sigma(j)},u_{\sigma(i)}\in
alpar@3
   809
V'_{small})\Rightarrow \exists k : k < i \wedge (\forall l: k\leq
alpar@3
   810
l\leq i \Rightarrow u_{l}\in V'_{small}) \wedge
alpar@3
   811
(u_{\sigma{(k)}},u_{\sigma{(i)}})\in E'_{small}$, where $i,j,k,l\in
alpar@3
   812
\{1,..,|V_{small}|\}$\newline
alpar@2
   813
\end{claim}
alpar@2
   814
\begin{proof}
alpar@3
   815
Suppose a matching order is given. It has to be shown that the node
alpar@3
   816
sequence has a structure described above.\\ Let
alpar@3
   817
$G'_{small}=(V'_{small},E'_{small})$ be an arbitrary component and $i$
alpar@3
   818
an arbitrary index.
alpar@2
   819
\newline
alpar@3
   820
$(\exists j : j<i\wedge u_{\sigma(j)},u_{\sigma(i)}\in
alpar@3
   821
V'_{small})\Rightarrow u_{\sigma(i)}$ is not the first covered node in
alpar@3
   822
$G_{small}\ \Rightarrow u_{\sigma(i)}$ is connected to a covered node
alpar@3
   823
$u_{\sigma(k)}$ where $k<i$, since $u_{\sigma(i)}\in T_{small}(s)$ for
alpar@3
   824
some $s$ and $T_{small}(s)\subseteq{V'_{small}}$ contains only nodes
alpar@3
   825
connected to at least one covered node. It's easy to see, that
alpar@3
   826
$\forall l: k\leq l\leq i \Rightarrow u_{l}\in V'_{small}$, since
alpar@3
   827
$T_{small}(s)$ contains only nodes connected to some covered ones,
alpar@3
   828
while it is not empty, but if it were empty, then $u_{\sigma(k)}$ and
alpar@3
   829
$u_{\sigma(i)}$ were not in the same component.
alpar@2
   830
alpar@3
   831
Now, let us show that if a node sequence has the special structure
alpar@3
   832
described above, it has to be matching
alpar@3
   833
order.\\ $(u_{\sigma(1)},u_{\sigma(2)},..,u_{\sigma(|V_{small}|)})$ is
alpar@3
   834
a total ordering, which determines the matching order
alpar@3
   835
$(u_{\sigma(1)},u_{\sigma(2)},..,u_{\sigma(|V_{small}|)})$.
alpar@2
   836
\end{proof}
alpar@2
   837
alpar@3
   838
To summing up, a total ordering always uniquely determines a matching
alpar@3
   839
order, and every matching order can be determined by a total ordering,
alpar@3
   840
however, more than one different total orderings may determine the
alpar@3
   841
same matching order.
alpar@2
   842
\subsection{Idea behind the algorithm}
alpar@3
   843
The goal is to find a matching order in which the algorithm is able to
alpar@3
   844
recognize inconsistency or prune the infeasible branches on the
alpar@3
   845
highest levels and goes deep only if it is needed.
alpar@2
   846
alpar@2
   847
\begin{notation}
alpar@3
   848
Let $\mathbf{Conn_{H}(u)}:=|\Gamma_{small}(u)\cap H\}|$, that is the
alpar@3
   849
number of neighbours of u which are in H, where $u\in V_{small} $ and
alpar@3
   850
$H\subseteq V_{small}$.
alpar@2
   851
\end{notation}
alpar@2
   852
alpar@3
   853
The principal question is the following. Suppose a state $s$ is
alpar@3
   854
given. For which node of $T_{small}(s)$ is the hardest to find a
alpar@3
   855
consistent pair in $G_{large}$? The more covered neighbours a node in
alpar@3
   856
$T_{small}(s)$ has --- i.e. the largest $Conn_{M_{small}(s)}$ it has
alpar@3
   857
---, the more rarely satisfiable consistency constraints for its pair
alpar@3
   858
are given.
alpar@2
   859
alpar@3
   860
In biology, most of the graphs are sparse, thus several nodes in
alpar@3
   861
$T_{small}(s)$ may have the same $Conn_{M_{small}(s)}$, which makes
alpar@3
   862
reasonable to define a secondary and a tertiary order between them.
alpar@3
   863
The observation above proves itself to be as determining, that the
alpar@3
   864
secondary ordering prefers nodes with the most uncovered neighbours
alpar@3
   865
among which have the same $Conn_{M_{small}(s)}$ to increase
alpar@3
   866
$Conn_{M_{small}(s)}$ of uncovered nodes so much, as possible.  The
alpar@3
   867
tertiary ordering prefers nodes having the rarest uncovered labels.
alpar@2
   868
alpar@3
   869
Note that the secondary ordering is the same as the ordering by $deg$,
alpar@3
   870
which is a static data in front of the above used.
alpar@2
   871
alpar@3
   872
These rules can easily result in a matching order which contains the
alpar@3
   873
nodes of a long path successively, whose nodes may have low $Conn$ and
alpar@3
   874
is easily matchable into $G_{large}$. To avoid that, a BFS order is
alpar@3
   875
used, which provides the shortest possible paths.
alpar@2
   876
\newline
alpar@2
   877
alpar@3
   878
In the following, some examples on which the VF2 may be slow are
alpar@3
   879
described, although they are easily solvable by using a proper
alpar@3
   880
matching order.
alpar@2
   881
alpar@2
   882
\begin{example}
alpar@3
   883
Suppose $G_{small}$ can be mapped into $G_{large}$ in many ways
alpar@3
   884
without node labels. Let $u\in V_{small}$ and $v\in V_{large}$.
alpar@2
   885
\newline
alpar@2
   886
$lab(u):=black$
alpar@2
   887
\newline
alpar@2
   888
$lab(v):=black$
alpar@2
   889
\newline
alpar@3
   890
$lab(\tilde{u}):=red \ \forall \tilde{u}\in (V_{small}\backslash
alpar@3
   891
\{u\})$
alpar@2
   892
\newline
alpar@3
   893
$lab(\tilde{v}):=red \ \forall \tilde{v}\in (V_{large}\backslash
alpar@3
   894
\{v\})$
alpar@2
   895
\newline
alpar@2
   896
alpar@3
   897
Now, any mapping by the node label $lab$ must contain $(u,v)$, since
alpar@3
   898
$u$ is black and no node in $V_{large}$ has a black label except
alpar@3
   899
$v$. If unfortunately $u$ were the last node which will get covered,
alpar@3
   900
VF2 would check only in the last steps, whether $u$ can be matched to
alpar@3
   901
$v$.
alpar@2
   902
\newline
alpar@3
   903
However, had $u$ been the first matched node, u would have been
alpar@3
   904
matched immediately to v, so all the mappings would have been
alpar@3
   905
precluded in which node labels can not correspond.
alpar@2
   906
\end{example}
alpar@2
   907
alpar@2
   908
\begin{example}
alpar@3
   909
Suppose there is no node label given, $G_{small}$ is a small graph and
alpar@3
   910
can not be mapped into $G_{large}$ and $u\in V_{small}$.
alpar@2
   911
\newline
alpar@3
   912
Let $G'_{small}:=(V_{small}\cup
alpar@3
   913
\{u'_{1},u'_{2},..,u'_{k}\},E_{small}\cup
alpar@3
   914
\{(u,u'_{1}),(u'_{1},u'_{2}),..,(u'_{k-1},u'_{k})\})$, that is,
alpar@3
   915
$G'_{small}$ is $G_{small}\cup \{ a\ k$ long path, which is disjoint
alpar@3
   916
from $G_{small}$ and one of its starting points is connected to $u\in
alpar@3
   917
V_{small}\}$.
alpar@2
   918
\newline
alpar@3
   919
Is there a subgraph of $G_{large}$, which is isomorph with
alpar@3
   920
$G'_{small}$?
alpar@2
   921
\newline
alpar@3
   922
If unfortunately the nodes of the path were the first $k$ nodes in the
alpar@3
   923
matching order, the algorithm would iterate through all the possible k
alpar@3
   924
long paths in $G_{large}$, and it would recognize that no path can be
alpar@3
   925
extended to $G'_{small}$.
alpar@2
   926
\newline
alpar@3
   927
However, had it started by the matching of $G_{small}$, it would not
alpar@3
   928
have matched any nodes of the path.
alpar@2
   929
\end{example}
alpar@2
   930
alpar@3
   931
These examples may look artificial, but the same problems also appear
Madarasi@7
   932
in real-world instances, even though in a less obvious way.
alpar@2
   933
alpar@2
   934
\subsection{Total ordering}
alpar@3
   935
Instead of the total ordering relation, the matching order will be
alpar@3
   936
searched directly.
alpar@2
   937
\begin{notation}
alpar@3
   938
Let \textbf{F$_\mathcal{M}$(l)}$:=|\{v\in V_{large} :
alpar@3
   939
l=lab(v)\}|-|\{u\in V_{small}\backslash \mathcal{M} : l=lab(u)\}|$ ,
alpar@3
   940
where $l$ is a label and $\mathcal{M}\subseteq V_{small}$.
alpar@2
   941
\end{notation}
alpar@2
   942
alpar@2
   943
\begin{definition}Let $\mathbf{arg\ max}_{f}(S) :=\{u : u\in S \wedge f(u)=max_{v\in S}\{f(v)\}\}$ and $\mathbf{arg\ min}_{f}(S) := arg\ max_{-f}(S)$, where $S$ is a finite set and $f:S\longrightarrow \mathbb{R}$.
alpar@2
   944
\end{definition}
alpar@2
   945
alpar@2
   946
\begin{algorithm}
alpar@3
   947
\algtext*{EndIf}%ne nyomtasson end if-et \algtext*{EndFor}%ne
alpar@3
   948
nyomtasson ..  \algtext*{EndProcedure}%ne nyomtasson ..
alpar@2
   949
\algtext*{EndWhile}
alpar@2
   950
\caption{\hspace{0.5cm}$The\ method\ of\ VF2++\ for\ determining\ the\ node\ order$}\label{alg:VF2PPPseu}
alpar@2
   951
\begin{algorithmic}[1]
alpar@3
   952
\Procedure{VF2++order}{} \State $\mathcal{M}$ := $\emptyset$
alpar@3
   953
\Comment{matching order} \While{$V_{small}\backslash \mathcal{M}
alpar@3
   954
  \neq\emptyset$} \State $r\in$ arg max$_{deg}$ (arg
alpar@3
   955
min$_{F_\mathcal{M}\circ lab}(V_{small}\backslash
alpar@3
   956
\mathcal{M})$)\label{alg:findMin} \State Compute $T$, a BFS tree with
alpar@3
   957
root node $r$.  \For{$d=0,1,...,depth(T)$} \State $V_d$:=nodes of the
alpar@3
   958
$d$-th level \State Process $V_d$ \Comment{See Algorithm
alpar@4
   959
  \ref{alg:VF2PPProcess1}) and \ref{alg:VF2PPProcess2}} \EndFor
alpar@3
   960
\EndWhile \EndProcedure
alpar@2
   961
\end{algorithmic}
alpar@2
   962
\end{algorithm}
alpar@2
   963
alpar@2
   964
\begin{algorithm}
alpar@3
   965
\algtext*{EndIf}%ne nyomtasson end if-et \algtext*{EndFor}%ne
alpar@3
   966
nyomtasson ..  \algtext*{EndProcedure}%ne nyomtasson ..
alpar@2
   967
\algtext*{EndWhile}
alpar@2
   968
\caption{\hspace{.5cm}$A\ method\ for\ processing\ a\ level\ of\ the\ BFS\ tree$}\label{alg:VF2PPProcess1}
alpar@2
   969
\begin{algorithmic}[1]
alpar@3
   970
\Procedure{VF2++ProcessLevel1}{$V_{d}$} \While{$V_d\neq\emptyset$}
alpar@3
   971
\State $m\in$ arg min$_{F_\mathcal{M}\circ\ lab}($ arg max$_{deg}($arg
alpar@3
   972
max$_{Conn_{\mathcal{M}}}(V_{d})))$ \State $V_d:=V_d\backslash m$
alpar@3
   973
\State Append node $m$ to the end of $\mathcal{M}$ \State Refresh
alpar@3
   974
$F_\mathcal{M}$ \EndWhile \EndProcedure
alpar@2
   975
\end{algorithmic}
alpar@2
   976
\end{algorithm}
alpar@2
   977
alpar@2
   978
\begin{algorithm}
alpar@3
   979
\algtext*{EndIf}%ne nyomtasson end if-et \algtext*{EndFor}%ne
alpar@3
   980
nyomtasson ..  \algtext*{EndProcedure}%ne nyomtasson ..
alpar@2
   981
\algtext*{EndWhile}
alpar@2
   982
\caption{\hspace{0.5cm}$Another\ method\ for\ processing\ a\ level\ of\ the\ BFS\ tree$}\label{alg:VF2PPProcess2}
alpar@2
   983
\begin{algorithmic}[1]
alpar@3
   984
\Procedure{VF2++ProcessLevel2}{$V_{d}$} \State Sort $V_d$ in
alpar@3
   985
descending lex. order by $(Conn_{\mathcal{M}},deg,-F_\mathcal{M})$
alpar@3
   986
\State Append the sorted $V_d$ to the end of $\mathcal{M}$ \State
alpar@3
   987
Refresh $F_\mathcal{M}$ \EndProcedure
alpar@2
   988
\end{algorithmic}
alpar@2
   989
\end{algorithm}
alpar@4
   990
Algorithm~\ref{alg:VF2PPPseu} is a high level description of the
alpar@4
   991
matching order procedure of VF2++. It computes a BFS tree for each
alpar@3
   992
component in ascending order of their rarest $lab$ and largest $deg$,
alpar@4
   993
whose root vertex is the component's minimal
alpar@4
   994
node. Algorithms~\ref{alg:VF2PPProcess1} and \ref{alg:VF2PPProcess2}
alpar@4
   995
are two different methods to process a level of the BFS tree.
alpar@2
   996
alpar@3
   997
After sorting the nodes of the current level in descending
alpar@3
   998
lexicographic order by $(Conn_{\mathcal{M}},deg,-F_\mathcal{M})$,
alpar@4
   999
Algorithm~\ref{alg:VF2PPProcess2} appends them simultaneously to the
alpar@4
  1000
matching order $\mathcal{M}$ and refreshes $F_\mathcal{M}$ just once,
alpar@4
  1001
whereas Algorithm~\ref{alg:VF2PPProcess1} appends the nodes separately
alpar@4
  1002
to $\mathcal{M}$ and refreshes $F_\mathcal{M}$ immediately, so it
alpar@3
  1003
provides up-to-date label information and may result in a more
alpar@3
  1004
efficient matching order.
alpar@2
  1005
alpar@4
  1006
Claim~\ref{claim:MOclaim} shows that Algorithm~\ref{alg:VF2PPPseu}
alpar@4
  1007
provides a matching order.
alpar@2
  1008
alpar@2
  1009
alpar@2
  1010
\subsection{Cutting rules}
alpar@2
  1011
\label{VF2PPCuttingRules}
alpar@3
  1012
This section presents the cutting rules of VF2++, which are improved
alpar@3
  1013
by using extra information coming from the node labels.
alpar@2
  1014
\begin{notation}
alpar@3
  1015
Let $\mathbf{\Gamma_{small}^{l}(u)}:=\{\tilde{u} : lab(\tilde{u})=l
alpar@3
  1016
\wedge \tilde{u}\in \Gamma_{small} (u)\}$ and
alpar@3
  1017
$\mathbf{\Gamma_{large}^{l}(v)}:=\{\tilde{v} : lab(\tilde{v})=l \wedge
alpar@3
  1018
\tilde{v}\in \Gamma_{large} (v)\}$, where $u\in V_{small}$, $v\in
alpar@3
  1019
V_{large}$ and $l$ is a label.
alpar@2
  1020
\end{notation}
alpar@2
  1021
alpar@2
  1022
\subsubsection{Induced subgraph isomorphism}
alpar@2
  1023
\begin{claim}
alpar@2
  1024
\[LabCut_{IND}((u,v),M(s))\!:=\!\!\!\!\!\bigvee_{l\ is\ label}\!\!\!\!\!\!\!|\Gamma_{large}^{l} (v) \cap T_{large}(s)|\!<\!|\Gamma_{small}^{l}(u)\cap T_{small}(s)|\ \vee\]\[\bigvee_{l\ is\ label} \newline |\Gamma_{large}^{l}(v)\cap \tilde{T}_{large}(s)| < |\Gamma_{small}^{l}(u)\cap \tilde{T}_{small}(s)|\] is a cutting function by IND.
alpar@2
  1025
\end{claim}
alpar@2
  1026
\begin{proof}
alpar@3
  1027
It has to be shown, that $LabCut_{IND}((u,v),M(s))=true\Rightarrow$
alpar@3
  1028
the mapping can not be extended to a whole
alpar@3
  1029
mapping.\\ $LabCut_{IND}((u,v),M(s))=true,$ iff the following
alpar@3
  1030
holds. $\\\exists l: |\Gamma_{large}^{l} (v)\ \cap\ T_{large}(s)| <
alpar@3
  1031
|\Gamma_{small}^{l} (u)\ \cap\ T_{small}(s)| \vee
alpar@3
  1032
|\Gamma_{large}^{l}(v)\cap \tilde{T}_{large}(s)| <
alpar@3
  1033
|\Gamma_{small}^{l}(u)\cap \tilde{T}_{small}(s)|$.
alpar@2
  1034
alpar@3
  1035
Suppose that $|\Gamma_{large}^{l} (v)\ \cap\ T_{large}(s)| <
alpar@3
  1036
|\Gamma_{small}^{l} (u)\ \cap\ T_{small}(s)|$. Each node of
alpar@3
  1037
$\Gamma_{small}^{l} (u)\ \cap\ T_{small}(s)$ has to be matched to a
alpar@3
  1038
node in $\Gamma_{large}^{l} (v)\ \cap\ T_{large}(s)$, so
alpar@3
  1039
$\Gamma_{large}^{l} (v)\ \cap\ T_{large}(s)$ can not be smaller than
alpar@3
  1040
$\Gamma_{small}^{l} (u)\ \cap\ T_{small}(s)$. That is why $M(s)$ can
alpar@3
  1041
not be extended to a whole mapping.
alpar@2
  1042
alpar@3
  1043
Otherwise $|\Gamma_{large}^{l}(v)\cap \tilde{T}_{large}(s)| <
alpar@3
  1044
|\Gamma_{small}^{l}(u)\cap \tilde{T}_{small}(s)|$ has to be
alpar@3
  1045
true. Similarly, each node of $\Gamma_{large}^{l}(v)\cap
alpar@3
  1046
\tilde{T}_{large}(s)$ has to be matched to a node in
alpar@3
  1047
$\Gamma_{small}^{l}(u)\cap \tilde{T}_{small}(s)$,
alpar@3
  1048
i.e. $\Gamma_{large}^{l}(v)\cap \tilde{T}_{large}(s)$ can not be
alpar@3
  1049
smaller than $\Gamma_{small}^{l}(u)\cap \tilde{T}_{small}(s)$, if
alpar@3
  1050
$M(s)$ is extendible.
alpar@2
  1051
\end{proof}
alpar@2
  1052
The following claims can be proven similarly.
alpar@2
  1053
\subsubsection{Graph isomorphism}
alpar@2
  1054
\begin{claim}
alpar@2
  1055
\[LabCut_{ISO}((u,v),M(s))\!:=\!\!\!\!\!\bigvee_{l\ is\ label}\!\!\!\!\!\!\!|\Gamma_{large}^{l} (v) \cap T_{large}(s)|\!\neq\!|\Gamma_{small}^{l}(u)\cap T_{small}(s)|\  \vee\]\[\bigvee_{l\ is\ label} \newline |\Gamma_{large}^{l}(v)\cap \tilde{T}_{large}(s)| \neq |\Gamma_{small}^{l}(u)\cap \tilde{T}_{small}(s)|\] is a cutting function by ISO.
alpar@2
  1056
\end{claim}
alpar@2
  1057
alpar@2
  1058
\subsubsection{Subgraph isomorphism}
alpar@2
  1059
\begin{claim}
Madarasi@7
  1060
\[LabCut_{SUB}((u,v),M(s))\!:=\!\!\!\!\!\bigvee_{l\ is\ label}\!\!\!\!\!\!\!|\Gamma_{large}^{l} (v) \cap T_{large}(s)|\!<\!|\Gamma_{small}^{l}(u)\cap T_{small}(s)|\] is a cutting function by SUB.
alpar@2
  1061
\end{claim}
alpar@2
  1062
alpar@2
  1063
alpar@2
  1064
alpar@2
  1065
\subsection{Implementation details}
alpar@3
  1066
This section provides a detailed summary of an efficient
alpar@3
  1067
implementation of VF2++.
alpar@2
  1068
\subsubsection{Storing a mapping}
alpar@3
  1069
After fixing an arbitrary node order ($u_0, u_1, ..,
alpar@3
  1070
u_{|G_{small}|-1}$) of $G_{small}$, an array $M$ is usable to store
alpar@3
  1071
the current mapping in the following way.
alpar@2
  1072
\[
alpar@3
  1073
 M[i] =
alpar@2
  1074
  \begin{cases} 
alpar@3
  1075
   v & if\ (u_i,v)\ is\ in\ the\ mapping\\ INVALID &
alpar@3
  1076
   if\ no\ node\ has\ been\ mapped\ to\ u_i.
alpar@2
  1077
  \end{cases}
alpar@2
  1078
\]
alpar@3
  1079
Where $i\in\{0,1, ..,|G_{small}|-1\}$, $v\in V_{large}$ and $INVALID$
alpar@3
  1080
means "no node".
alpar@2
  1081
\subsubsection{Avoiding the recurrence}
alpar@3
  1082
Exploring the state space was described in a recursive fashion using
alpar@4
  1083
sets (see Algorithm~\ref{alg:VF2Pseu}), which makes the
alpar@3
  1084
algorithm easy to understand but does not show directly an efficient
alpar@3
  1085
way of implementation. The following approach avoiding recursion and
alpar@3
  1086
using lookup tables instead of sets is not only fast but has linear
alpar@3
  1087
space complexity as well.
alpar@2
  1088
alpar@4
  1089
The recursion of Algorithm~\ref{alg:VF2Pseu} can be realized
alpar@3
  1090
as a while loop, which has a loop counter $depth$ denoting the
alpar@3
  1091
all-time depth of the recursion.  Fixing a matching order, let $M$
alpar@3
  1092
denote the array storing the all-time mapping.  The initial state is
alpar@3
  1093
associated with the empty mapping, which means that $\forall i:
alpar@3
  1094
M[i]=INVALID$ and $depth=0$.  In case of a recursive call, $depth$ has
alpar@3
  1095
to be incremented, while in case of a return, it has to be
alpar@4
  1096
decremented.  Based on Claim~\ref{claim:claimCoverFromLeft},
alpar@3
  1097
$M$ is $INVALID$ from index $depth$+1 and not $INVALID$ before
alpar@3
  1098
$depth$, i.e. $\forall i: i < depth \Rightarrow M[i]\neq INVALID$ and
alpar@3
  1099
$\forall i: i > depth \Rightarrow M[i]= INVALID$. $M[depth]$ changes
alpar@3
  1100
while the state is being processed, but the property is held before
alpar@3
  1101
both stepping back to a predecessor state and exploring a successor
alpar@3
  1102
state.
alpar@2
  1103
alpar@3
  1104
The necessary part of the candidate set is easily maintainable or
alpar@3
  1105
computable by following
alpar@4
  1106
Section~\ref{candidateComputingVF2}. A much faster method
alpar@3
  1107
has been designed for biological- and sparse graphs, see the next
alpar@3
  1108
section for details.
alpar@2
  1109
alpar@2
  1110
\subsubsection{Calculating the candidates for a node}
alpar@4
  1111
Being aware of Claim~\ref{claim:claimCoverFromLeft}, the
alpar@3
  1112
task is not to maintain the candidate set, but to generate the
alpar@3
  1113
candidate nodes in $G_{large}$ for a given node $u\in V_{small}$.  In
alpar@3
  1114
case of an expanding problem type and $M$ mapping, if a node $v\in
alpar@3
  1115
V_{large}$ is a potential pair of $u\in V_{small}$, then $\forall
alpar@3
  1116
u'\in V_{small} : (u,u')\in
alpar@3
  1117
E_{small}\ and\ u'\ is\ covered\ by\ M\ \Rightarrow (v,Pair(M,u'))\in
alpar@3
  1118
E_{large}$. That is, each covered neighbour of $u$ has to be mapped to
alpar@3
  1119
a covered neighbour of $v$.
alpar@2
  1120
alpar@3
  1121
Having said that, an algorithm running in $\Theta(deg)$ time is
alpar@3
  1122
describable if there exists a covered node in the component containing
alpar@3
  1123
$u$. In this case choose a covered neighbour $u'$ of $u$ arbitrarily
alpar@3
  1124
--- such a node exists based on
alpar@4
  1125
Claim~\ref{claim:MOclaim}. With all the candidates of $u$
alpar@3
  1126
being among the uncovered neighbours of $Pair(M,u')$, there are solely
alpar@3
  1127
$deg(Pair(M,u'))$ nodes to check.
alpar@2
  1128
alpar@3
  1129
An easy trick is to choose an $u'$, for which
alpar@3
  1130
$|\{uncovered\ neighbours\ $ $of\ Pair(M,u')\}|$ is the smallest
alpar@3
  1131
possible.
alpar@2
  1132
alpar@3
  1133
Note that if $u$ is the first node of its component, then all the
alpar@3
  1134
uncovered nodes of $G_{large}$ are candidates, so giving a sublinear
alpar@3
  1135
method is impossible.
alpar@2
  1136
alpar@2
  1137
alpar@2
  1138
\subsubsection{Determining the node order}
alpar@3
  1139
This section describes how the node order preprocessing method of
alpar@3
  1140
VF2++ can efficiently be implemented.
alpar@2
  1141
alpar@3
  1142
For using lookup tables, the node labels are associated with the
alpar@3
  1143
numbers $\{0,1,..,|K|-1\}$, where $K$ is the set of the labels. It
alpar@3
  1144
enables $F_\mathcal{M}$ to be stored in an array, for which
alpar@3
  1145
$F_\mathcal{M}[i]=F_\mathcal{M}(i)$ where $i=0,1,..,|K|-1$. At first,
alpar@3
  1146
$\mathcal{M}=\emptyset$, so $F_\mathcal{M}[i]$ is the number of nodes
alpar@3
  1147
in $V_{small}$ having label i, which is easy to compute in
alpar@3
  1148
$\Theta(|V_{small}|)$ steps.
alpar@2
  1149
alpar@3
  1150
$\mathcal{M}\subseteq V_{small}$ can be represented as an array of
alpar@3
  1151
size $|V_{small}|$.
alpar@2
  1152
alpar@3
  1153
The BFS tree is computed by using a FIFO data structure which is
alpar@3
  1154
usually implemented as a linked list, but one can avoid it by using
alpar@3
  1155
the array $\mathcal{M}$ itself. $\mathcal{M}$ contains all the nodes
alpar@3
  1156
seen before, a pointer shows where the first node of the FIFO is, and
alpar@3
  1157
another one shows where the next explored node has to be inserted. So
alpar@3
  1158
the nodes of each level of the BFS tree can be processed by
alpar@4
  1159
Algorithms~\ref{alg:VF2PPProcess1} and
alpar@4
  1160
\ref{alg:VF2PPProcess2} in place by swapping nodes.
alpar@2
  1161
alpar@3
  1162
After a node $u$ gets to the next place of the node order,
alpar@3
  1163
$F_\mathcal{M}[lab[u]]$ has to be decreased by one, because there is
alpar@3
  1164
one less covered node in $V_{large}$ with label $lab(u)$, that is why
alpar@3
  1165
min selection sort is preferred which gives the elements from left to
alpar@4
  1166
right in descending order, see Algorithm~\ref{alg:VF2PPProcess1}.
alpar@2
  1167
alpar@3
  1168
Note that using a $\Theta(n^2)$ sort absolutely does not slow down the
alpar@3
  1169
procedure on biological (and on sparse) graphs, since they have few
alpar@3
  1170
nodes on a level. If a level had a large number of nodes,
alpar@4
  1171
Algorithm~\ref{alg:VF2PPProcess2} would seem to be a better
alpar@3
  1172
choice with a $\Theta(nlog(n))$ or Bucket sort, but it may reduce the
alpar@3
  1173
efficiency of the matching procedure, since $F_\mathcal{M}(i)$ can not
alpar@3
  1174
be immediately refreshed, so it is unable to provide up-to-date label
alpar@3
  1175
information.
alpar@2
  1176
alpar@4
  1177
Note that the \textit{while loop} of Algorithm~\ref{alg:VF2PPPseu}
alpar@4
  1178
takes one iteration per graph component and the graphs in biology are
alpar@4
  1179
mostly connected.
alpar@4
  1180
alpar@2
  1181
\subsubsection{Cutting rules}
alpar@4
  1182
In Section~\ref{VF2PPCuttingRules}, the cutting rules were
alpar@3
  1183
described using the sets $T_{small}$, $T_{large}$, $\tilde T_{small}$
alpar@3
  1184
and $\tilde T_{large}$, which are dependent on the all-time mapping
alpar@3
  1185
(i.e. on the all-time state). The aim is to check the labeled cutting
alpar@3
  1186
rules of VF2++ in $\Theta(deg)$ time.
alpar@2
  1187
alpar@3
  1188
Firstly, suppose that these four sets are given in such a way, that
alpar@3
  1189
checking whether a node is in a certain set takes constant time,
alpar@3
  1190
e.g. they are given by their 0-1 characteristic vectors. Let $L$ be an
alpar@3
  1191
initially zero integer lookup table of size $|K|$. After incrementing
alpar@3
  1192
$L[lab(u')]$ for all $u'\in \Gamma_{small}(u) \cap T_{small}(s)$ and
alpar@3
  1193
decrementing $L[lab(v')]$ for all $v'\in\Gamma_{large} (v) \cap
alpar@3
  1194
T_{large}(s)$, the first part of the cutting rules is checkable in
alpar@3
  1195
$\Theta(deg)$ time by considering the proper signs of $L$. Setting $L$
alpar@3
  1196
to zero takes $\Theta(deg)$ time again, which makes it possible to use
alpar@3
  1197
the same table through the whole algorithm.  The second part of the
alpar@3
  1198
cutting rules can be verified using the same method with $\tilde
alpar@3
  1199
T_{small}$ and $\tilde T_{large}$ instead of $T_{small}$ and
alpar@3
  1200
$T_{large}$. Thus, the overall complexity is $\Theta(deg)$.
alpar@2
  1201
alpar@3
  1202
An other integer lookup table storing the number of covered neighbours
alpar@3
  1203
of each node in $G_{large}$ gives all the information about the sets
alpar@3
  1204
$T_{large}$ and $\tilde T_{large}$, which is maintainable in
alpar@3
  1205
$\Theta(deg)$ time when a pair is added or substracted by incrementing
alpar@3
  1206
or decrementing the proper indices. A further improvement is that the
alpar@3
  1207
values of $L[lab(u')]$ in case of checking $u$ is dependent only on
alpar@3
  1208
$u$, i.e. on the size of the mapping, so for each $u\in V_{small}$ an
alpar@3
  1209
array of pairs (label, number of such labels) can be stored to skip
alpar@3
  1210
the maintaining operations. Note that these arrays are at most of size
alpar@3
  1211
$deg$. Skipping this trick, the number of covered neighbours has to be
alpar@3
  1212
stored for each node of $G_{small}$ as well to get the sets
alpar@3
  1213
$T_{small}$ and $\tilde T_{small}$.
alpar@2
  1214
alpar@3
  1215
Using similar tricks, the consistency function can be evaluated in
alpar@3
  1216
$\Theta(deg)$ steps, as well.
alpar@2
  1217
alpar@2
  1218
\section{The VF2 Plus Algorithm}
alpar@3
  1219
The VF2 Plus algorithm is a recently improved version of VF2. It was
alpar@3
  1220
compared with the state of the art algorithms in \cite{VF2Plus} and
alpar@3
  1221
has proven itself to be competitive with RI, the best algorithm on
alpar@3
  1222
biological graphs.  \\ A short summary of VF2 Plus follows, which uses
alpar@3
  1223
the notation and the conventions of the original paper.
alpar@2
  1224
alpar@2
  1225
\subsection{Ordering procedure}
alpar@3
  1226
VF2 Plus uses a sorting procedure that prefers nodes in $V_{small}$
alpar@3
  1227
with the lowest probability to find a pair in $V_{small}$ and the
alpar@3
  1228
highest number of connections with the nodes already sorted by the
alpar@3
  1229
algorithm.
alpar@2
  1230
alpar@2
  1231
\begin{definition}
alpar@3
  1232
$(u,v)$ is a \textbf{feasible pair}, if $lab(u)=lab(v)$ and
alpar@3
  1233
  $deg(u)\leq deg(v)$, where $u\in{V_{small}}$ and $ v\in{V_{large}}$.
alpar@2
  1234
\end{definition}
alpar@3
  1235
$P_{lab}(L):=$ a priori probability to find a node with label $L$ in
alpar@3
  1236
$V_{large}$
alpar@2
  1237
\newline
alpar@3
  1238
$P_{deg}(d):=$ a priori probability to find a node with degree $d$ in
alpar@3
  1239
$V_{large}$
alpar@2
  1240
\newline
alpar@3
  1241
$P(u):=P_{lab}(L)*\bigcup_{d'>d}P_{deg}(d')$\\ $M$ is the set of
alpar@3
  1242
already sorted nodes, $T$ is the set of nodes candidate to be
alpar@3
  1243
selected, and $degreeM$ of a node is the number of its neighbours in
alpar@3
  1244
$M$.
alpar@2
  1245
\begin{algorithm}
alpar@3
  1246
\algtext*{EndIf}%ne nyomtasson end if-et \algtext*{EndFor}%ne
alpar@3
  1247
nyomtasson ..  \algtext*{EndProcedure}%ne nyomtasson ..
alpar@2
  1248
\algtext*{EndWhile}
alpar@2
  1249
\caption{}\label{alg:VF2PlusPseu}
alpar@2
  1250
\begin{algorithmic}[1]
alpar@3
  1251
\Procedure{VF2 Plus order}{} \State Select the node with the lowest
alpar@3
  1252
$P$.  \If {more nodes share the same $P$} \State select the one with
alpar@3
  1253
maximum degree \EndIf \If {more nodes share the same $P$ and have the
alpar@3
  1254
  max degree} \State select the first \EndIf \State Put the selected
alpar@3
  1255
node in the set $M$. \label{alg:putIn} \State Put all its unsorted
alpar@3
  1256
neighbours in the set $T$.  \If {$M\neq V_{small}$} \State From set
alpar@3
  1257
$T$ select the node with maximum $degreeM$.  \If {more nodes have
alpar@3
  1258
  maximum $degreeM$} \State Select the one with the lowest $P$ \EndIf
alpar@3
  1259
\If {more nodes have maximum $degreeM$ and $P$} \State Select the
alpar@3
  1260
first.  \EndIf \State \textbf{goto \ref{alg:putIn}.}  \EndIf
alpar@2
  1261
\EndProcedure
alpar@2
  1262
\end{algorithmic}
alpar@2
  1263
\end{algorithm}
alpar@2
  1264
alpar@4
  1265
Using these notations, Algorithm~\ref{alg:VF2PlusPseu}
alpar@3
  1266
provides the description of the sorting procedure.
alpar@2
  1267
alpar@3
  1268
Note that $P(u)$ is not the exact probability of finding a consistent
alpar@3
  1269
pair for $u$ by choosing a node of $V_{large}$ randomly, since
alpar@3
  1270
$P_{lab}$ and $P_{deg}$ are not independent, though calculating the
alpar@3
  1271
real probability would take quadratic time, which may be reduced by
alpar@3
  1272
using fittingly lookup tables.
alpar@2
  1273
alpar@2
  1274
\section{Experimental results}
alpar@3
  1275
This section compares the performance of VF2++ and VF2 Plus. Both
alpar@3
  1276
algorithms have run faster with orders of magnitude than VF2, thus its
alpar@3
  1277
inclusion was not reasonable.
alpar@2
  1278
\subsection{Biological graphs}
alpar@3
  1279
The tests have been executed on a recent biological dataset created
alpar@3
  1280
for the International Contest on Pattern Search in Biological
Madarasi@7
  1281
Databases\cite{Content}, which has been constructed of molecule,
Madarasi@7
  1282
protein and contact map graphs extracted from the Protein Data
alpar@3
  1283
Bank\cite{ProteinDataBank}.
alpar@2
  1284
alpar@3
  1285
The molecule dataset contains small graphs with less than 100 nodes
alpar@3
  1286
and an average degree of less than 3. The protein dataset contains
alpar@3
  1287
graphs having 500-10 000 nodes and an average degree of 4, while the
alpar@3
  1288
contact map dataset contains graphs with 150-800 nodes and an average
alpar@3
  1289
degree of 20.  \\
alpar@2
  1290
alpar@3
  1291
In the following, the induced subgraph isomorphism and the graph
alpar@3
  1292
isomorphism will be examined.
alpar@2
  1293
Madarasi@7
  1294
This dataset provides graph pairs, between which all the induced subgraph isomorphisms have to be found. For run time results, please see Figure~\ref{fig:bioIND}.
Madarasi@7
  1295
Madarasi@7
  1296
In an other experiment, the nodes of each graph in the database had been
Madarasi@7
  1297
shuffled, and an isomorphism between the shuffled and the original
Madarasi@7
  1298
graph was searched. The solution times are shown on Figure~\ref{fig:bioISO}.
Madarasi@7
  1299
Madarasi@7
  1300
alpar@2
  1301
alpar@2
  1302
\begin{figure}[H]
Madarasi@7
  1303
\vspace*{-2cm}
Madarasi@7
  1304
\hspace*{-1.5cm}
Madarasi@7
  1305
\begin{subfigure}[b]{0.55\textwidth}
Madarasi@7
  1306
\begin{figure}[H]
Madarasi@7
  1307
\begin{tikzpicture}[trim axis left, trim axis right]
Madarasi@7
  1308
\begin{axis}[title=Molecules ISO,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid
Madarasi@7
  1309
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
Madarasi@7
  1310
  west},scaled x ticks = false,x tick label style={/pgf/number
Madarasi@7
  1311
  format/1000 sep = \thinspace}]
Madarasi@7
  1312
%\addplot+[only marks] table {proteinsOrig.txt};
Madarasi@7
  1313
\addplot table {Orig/moleculesIso.txt}; \addplot[mark=triangle*,mark
Madarasi@7
  1314
  size=1.8pt,color=red] table {VF2PPLabel/moleculesIso.txt};
Madarasi@7
  1315
\end{axis}
Madarasi@7
  1316
\end{tikzpicture}
Madarasi@7
  1317
\caption{In the case of molecules, there is not such a significant
Madarasi@7
  1318
  difference, but VF2++ seems to be faster as the number of nodes
Madarasi@7
  1319
  increases.}\label{fig:ISOMolecule}
Madarasi@7
  1320
\end{figure}
Madarasi@7
  1321
\end{subfigure}
Madarasi@7
  1322
\hspace*{1.5cm}
Madarasi@7
  1323
\begin{subfigure}[b]{0.55\textwidth}
Madarasi@7
  1324
\begin{figure}[H]
Madarasi@7
  1325
\begin{tikzpicture}[trim axis left, trim axis right]
Madarasi@7
  1326
\begin{axis}[title=Contact maps ISO,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid
Madarasi@7
  1327
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
Madarasi@7
  1328
  west},scaled x ticks = false,x tick label style={/pgf/number
Madarasi@7
  1329
  format/1000 sep = \thinspace}]
Madarasi@7
  1330
%\addplot+[only marks] table {proteinsOrig.txt};
Madarasi@7
  1331
\addplot table {Orig/contactMapsIso.txt}; \addplot[mark=triangle*,mark
Madarasi@7
  1332
  size=1.8pt,color=red] table {VF2PPLabel/contactMapsIso.txt};
Madarasi@7
  1333
\end{axis}
Madarasi@7
  1334
\end{tikzpicture}
Madarasi@7
  1335
\caption{The results are closer to each other on contact maps, but
Madarasi@7
  1336
  VF2++ still performs consistently better.}\label{fig:ISOContact}
Madarasi@7
  1337
\end{figure}
Madarasi@7
  1338
\end{subfigure}
Madarasi@7
  1339
alpar@2
  1340
\begin{center}
Madarasi@7
  1341
\vspace*{-0.5cm}
Madarasi@7
  1342
\begin{subfigure}[b]{0.55\textwidth}
Madarasi@7
  1343
\begin{figure}[H]
Madarasi@7
  1344
\begin{tikzpicture}[trim axis left, trim axis right]
Madarasi@7
  1345
\begin{axis}[title=Proteins ISO,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid
Madarasi@7
  1346
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
Madarasi@7
  1347
  west},scaled x ticks = false,x tick label style={/pgf/number
Madarasi@7
  1348
  format/1000 sep = \thinspace}]
Madarasi@7
  1349
%\addplot+[only marks] table {proteinsOrig.txt};
Madarasi@7
  1350
\addplot table {Orig/proteinsIso.txt}; \addplot[mark=triangle*,mark
Madarasi@7
  1351
  size=1.8pt,color=red] table {VF2PPLabel/proteinsIso.txt};
Madarasi@7
  1352
\end{axis}
Madarasi@7
  1353
\end{tikzpicture}
Madarasi@7
  1354
\caption{On protein graphs, VF2 Plus has a super linear time
Madarasi@7
  1355
  complexity, while VF2++ runs in near constant time. The difference
Madarasi@7
  1356
  is about two order of magnitude on large graphs.}\label{fig:ISOProt}
Madarasi@7
  1357
\end{figure}
Madarasi@7
  1358
\end{subfigure}
Madarasi@7
  1359
\end{center}
Madarasi@7
  1360
\vspace*{-0.6cm}
Madarasi@7
  1361
\caption{\normalsize{Graph isomomorphism on biological graphs}}\label{fig:bioISO}
Madarasi@7
  1362
\end{figure}
Madarasi@7
  1363
Madarasi@7
  1364
Madarasi@7
  1365
\begin{figure}[H]
Madarasi@7
  1366
\vspace*{-2cm}
Madarasi@7
  1367
\hspace*{-1.5cm}
Madarasi@7
  1368
\begin{subfigure}[b]{0.55\textwidth}
Madarasi@7
  1369
\begin{figure}[H]
Madarasi@7
  1370
\begin{tikzpicture}[trim axis left, trim axis right]
Madarasi@7
  1371
\begin{axis}[title=Molecules IND,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid
Madarasi@7
  1372
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
Madarasi@7
  1373
  west},scaled x ticks = false,x tick label style={/pgf/number
Madarasi@7
  1374
  format/1000 sep = \thinspace}]
Madarasi@7
  1375
%\addplot+[only marks] table {proteinsOrig.txt};
Madarasi@7
  1376
\addplot table {Orig/Molecules.32.txt}; \addplot[mark=triangle*,mark
Madarasi@7
  1377
  size=1.8pt,color=red] table {VF2PPLabel/Molecules.32.txt};
Madarasi@7
  1378
\end{axis}
Madarasi@7
  1379
\end{tikzpicture}
Madarasi@7
  1380
\caption{In the case of molecules, the algorithms have
Madarasi@7
  1381
  similar behaviour, but VF2++ is almost two times faster even on such
Madarasi@7
  1382
  small graphs.} \label{fig:INDMolecule}
Madarasi@7
  1383
\end{figure}
Madarasi@7
  1384
\end{subfigure}
Madarasi@7
  1385
\hspace*{1.5cm}
Madarasi@7
  1386
\begin{subfigure}[b]{0.55\textwidth}
Madarasi@7
  1387
\begin{figure}[H]
Madarasi@7
  1388
\begin{tikzpicture}[trim axis left, trim axis right]
Madarasi@7
  1389
\begin{axis}[title=Contact maps IND,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid
Madarasi@7
  1390
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
Madarasi@7
  1391
  west},scaled x ticks = false,x tick label style={/pgf/number
Madarasi@7
  1392
  format/1000 sep = \thinspace}]
Madarasi@7
  1393
%\addplot+[only marks] table {proteinsOrig.txt};
Madarasi@7
  1394
\addplot table {Orig/ContactMaps.128.txt};
Madarasi@7
  1395
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
Madarasi@7
  1396
        {VF2PPLabel/ContactMaps.128.txt};
Madarasi@7
  1397
\end{axis}
Madarasi@7
  1398
\end{tikzpicture}
Madarasi@7
  1399
\caption{On contact maps, VF2++ runs in near constant time, while VF2
Madarasi@7
  1400
  Plus has a near linear behaviour.} \label{fig:INDContact}
Madarasi@7
  1401
\end{figure}
Madarasi@7
  1402
\end{subfigure}
Madarasi@7
  1403
Madarasi@7
  1404
\begin{center}
Madarasi@7
  1405
\vspace*{-0.5cm}
Madarasi@7
  1406
\begin{subfigure}[b]{0.55\textwidth}
Madarasi@7
  1407
\begin{figure}[H]
Madarasi@7
  1408
\begin{tikzpicture}[trim axis left, trim axis right]
alpar@2
  1409
  \begin{axis}[title=Proteins IND,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid
alpar@3
  1410
  =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1411
    west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1412
    format/1000 sep = \thinspace}] %\addplot+[only marks] table
alpar@3
  1413
    {proteinsOrig.txt}; \addplot[mark=*,mark size=1.2pt,color=blue]
alpar@3
  1414
    table {Orig/Proteins.256.txt}; \addplot[mark=triangle*,mark
alpar@3
  1415
      size=1.8pt,color=red] table {VF2PPLabel/Proteins.256.txt};
alpar@2
  1416
  \end{axis}
alpar@2
  1417
  \end{tikzpicture}
alpar@3
  1418
\caption{Both the algorithms have linear behaviour on protein
alpar@3
  1419
  graphs. VF2++ is more than 10 times faster than VF2
alpar@3
  1420
  Plus.} \label{fig:INDProt}
alpar@2
  1421
\end{figure}
Madarasi@7
  1422
\end{subfigure}
alpar@2
  1423
\end{center}
Madarasi@7
  1424
\vspace*{-0.5cm}
Madarasi@7
  1425
\caption{\normalsize{Graph isomomorphism on biological graphs}}\label{fig:bioIND}
alpar@2
  1426
\end{figure}
alpar@2
  1427
alpar@2
  1428
alpar@2
  1429
alpar@2
  1430
alpar@2
  1431
alpar@2
  1432
\subsection{Random graphs}
alpar@3
  1433
This section compares VF2++ with VF2 Plus on random graphs of a large
alpar@3
  1434
size. The node labels are uniformly distributed.  Let $\delta$ denote
alpar@3
  1435
the average degree.  For the parameters of problems solved in the
alpar@3
  1436
experiments, please see the top of each chart.
alpar@2
  1437
\subsubsection{Graph isomorphism}
alpar@3
  1438
To evaluate the efficiency of the algorithms in the case of graph
alpar@3
  1439
isomorphism, connected graphs of less than 20 000 nodes have been
alpar@3
  1440
considered. Generating a random graph and shuffling its nodes, an
Madarasi@7
  1441
isomorphism had to be found. Figure \ref{fig:randISO} shows the runtime results
alpar@4
  1442
on graph sets of various density.
alpar@2
  1443
Madarasi@7
  1444
Madarasi@7
  1445
Madarasi@7
  1446
alpar@2
  1447
\begin{figure}[H]
Madarasi@7
  1448
\vspace*{-1.5cm}
Madarasi@7
  1449
\hspace*{-1.5cm}
Madarasi@7
  1450
\begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1451
\begin{center}
alpar@2
  1452
\begin{tikzpicture}
Madarasi@7
  1453
\begin{axis}[title={Random ISO, $\delta = 5$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1454
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1455
  west},scaled x ticks = false,x tick label style={/pgf/number
Madarasi@7
  1456
  format/1000 sep = \space}]
alpar@2
  1457
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1458
\addplot table {randGraph/iso/vf2pIso5_1.txt};
alpar@3
  1459
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1460
        {randGraph/iso/vf2ppIso5_1.txt};
alpar@2
  1461
\end{axis}
alpar@2
  1462
\end{tikzpicture}
alpar@2
  1463
\end{center}
Madarasi@7
  1464
\end{subfigure}
Madarasi@7
  1465
%\hspace{1cm}
Madarasi@7
  1466
\begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1467
\begin{center}
alpar@2
  1468
\begin{tikzpicture}
Madarasi@7
  1469
\begin{axis}[title={Random ISO, $\delta = 10$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1470
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1471
  west},scaled x ticks = false,x tick label style={/pgf/number
Madarasi@7
  1472
  format/1000 sep = \space}]
alpar@2
  1473
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1474
\addplot table {randGraph/iso/vf2pIso10_1.txt};
alpar@3
  1475
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1476
        {randGraph/iso/vf2ppIso10_1.txt};
alpar@2
  1477
\end{axis}
alpar@2
  1478
\end{tikzpicture}
alpar@2
  1479
\end{center}
Madarasi@7
  1480
\end{subfigure}
Madarasi@7
  1481
%%\hspace{1cm}
Madarasi@7
  1482
\hspace*{-1.5cm}
Madarasi@7
  1483
\begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1484
\begin{center}
alpar@2
  1485
\begin{tikzpicture}
Madarasi@7
  1486
\begin{axis}[title={Random ISO, $\delta = 15$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1487
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1488
  west},scaled x ticks = false,x tick label style={/pgf/number
Madarasi@7
  1489
  format/1000 sep = \space}]
alpar@2
  1490
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1491
\addplot table {randGraph/iso/vf2pIso15_1.txt};
alpar@3
  1492
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1493
        {randGraph/iso/vf2ppIso15_1.txt};
alpar@2
  1494
\end{axis}
alpar@2
  1495
\end{tikzpicture}
alpar@2
  1496
\end{center}
Madarasi@7
  1497
     \end{subfigure}
Madarasi@7
  1498
     \begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1499
\begin{center}
alpar@2
  1500
\begin{tikzpicture}
Madarasi@7
  1501
\begin{axis}[title={Random ISO, $\delta = 35$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1502
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1503
  west},scaled x ticks = false,x tick label style={/pgf/number
Madarasi@7
  1504
  format/1000 sep = \space}]
alpar@2
  1505
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1506
\addplot table {randGraph/iso/vf2pIso35_1.txt};
alpar@3
  1507
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1508
        {randGraph/iso/vf2ppIso35_1.txt};
alpar@2
  1509
\end{axis}
alpar@2
  1510
\end{tikzpicture}
alpar@2
  1511
\end{center}
Madarasi@7
  1512
\end{subfigure}
Madarasi@7
  1513
\begin{subfigure}[b]{0.55\textwidth}
Madarasi@7
  1514
\hspace*{-1.5cm}
alpar@2
  1515
\begin{tikzpicture}
Madarasi@7
  1516
\begin{axis}[title={Random ISO, $\delta = 45$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1517
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1518
  west},scaled x ticks = false,x tick label style={/pgf/number
Madarasi@7
  1519
  format/1000 sep = \space}]
alpar@2
  1520
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1521
\addplot table {randGraph/iso/vf2pIso45_1.txt};
alpar@3
  1522
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1523
        {randGraph/iso/vf2ppIso45_1.txt};
alpar@2
  1524
\end{axis}
alpar@2
  1525
\end{tikzpicture}
Madarasi@7
  1526
\end{subfigure}
Madarasi@7
  1527
\hspace*{-1.5cm}
Madarasi@7
  1528
\begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1529
\begin{tikzpicture}
Madarasi@7
  1530
\begin{axis}[title={Random ISO, $\delta = 100$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1531
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1532
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1533
  format/1000 sep = \thinspace}]
alpar@2
  1534
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1535
\addplot table {randGraph/iso/vf2pIso100_1.txt};
alpar@3
  1536
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1537
        {randGraph/iso/vf2ppIso100_1.txt};
alpar@2
  1538
\end{axis}
alpar@2
  1539
\end{tikzpicture}
Madarasi@7
  1540
\end{subfigure}
alpar@2
  1541
\vspace*{-0.8cm}
Madarasi@7
  1542
\caption{IND on graphs having an average degree of
Madarasi@7
  1543
  5.}\label{fig:randISO}
alpar@2
  1544
\end{figure}
alpar@2
  1545
alpar@2
  1546
Madarasi@7
  1547
Madarasi@7
  1548
Madarasi@7
  1549
Madarasi@7
  1550
Madarasi@7
  1551
Madarasi@7
  1552
Madarasi@7
  1553
Madarasi@7
  1554
alpar@3
  1555
Considering the graph isomorphism problem, VF2++ consistently
alpar@3
  1556
outperforms its rival especially on sparse graphs. The reason for the
alpar@3
  1557
slightly super linear behaviour of VF2++ on denser graphs is the
alpar@3
  1558
larger number of nodes in the BFS tree constructed in
alpar@4
  1559
Algorithm~\ref{alg:VF2PPPseu}.
alpar@2
  1560
alpar@2
  1561
\subsubsection{Induced subgraph isomorphism}
alpar@3
  1562
This section provides a comparison of VF2++ and VF2 Plus in the case
alpar@3
  1563
of induced subgraph isomorphism. In addition to the size of the large
alpar@3
  1564
graph, that of the small graph dramatically influences the hardness of
alpar@3
  1565
a given problem too, so the overall picture is provided by examining
alpar@3
  1566
small graphs of various size.
alpar@2
  1567
alpar@3
  1568
For each chart, a number $0<\rho< 1$ has been fixed and the following
alpar@3
  1569
has been executed 150 times. Generating a large graph $G_{large}$,
alpar@3
  1570
choose 10 of its induced subgraphs having $\rho\ |V_{large}|$ nodes,
alpar@3
  1571
and for all the 10 subgraphs find a mapping by using both the graph
alpar@3
  1572
matching algorithms.  The $\delta = 5, 10, 35$ and $\rho = 0.05, 0.1,
alpar@4
  1573
0.3, 0.6, 0.8, 0.95$ cases have been examined (see
alpar@4
  1574
Figure~\ref{fig:randIND5}, \ref{fig:randIND10} and
alpar@4
  1575
\ref{fig:randIND35}, and for each $\delta$, a cumulative chart is
alpar@4
  1576
given as well, which excludes $\rho = 0.05$ and $0.1$ for the sake of
alpar@4
  1577
perspicuity (see Figure~\ref{fig:randIND5Sum}, \ref{fig:randIND10Sum}
alpar@4
  1578
and \ref{fig:randIND35Sum}).
alpar@2
  1579
alpar@2
  1580
alpar@2
  1581
alpar@2
  1582
alpar@2
  1583
alpar@2
  1584
\begin{figure}[H]
Madarasi@7
  1585
\vspace*{-1.5cm}
Madarasi@7
  1586
\hspace*{-1.5cm}
alpar@2
  1587
\begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1588
\begin{center}
alpar@2
  1589
\begin{tikzpicture}
alpar@2
  1590
\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.05$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1591
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1592
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1593
  format/1000 sep = \space}]
alpar@2
  1594
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1595
\addplot table {randGraph/ind/vf2pInd5_0.05.txt};
alpar@3
  1596
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1597
        {randGraph/ind/vf2ppInd5_0.05.txt};
alpar@2
  1598
\end{axis}
alpar@2
  1599
\end{tikzpicture}
alpar@2
  1600
\end{center}
alpar@2
  1601
     \end{subfigure}
alpar@2
  1602
     \begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1603
\begin{center}
alpar@2
  1604
\begin{tikzpicture}
alpar@2
  1605
\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.1$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1606
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1607
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1608
  format/1000 sep = \space}]
alpar@2
  1609
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1610
\addplot table {randGraph/ind/vf2pInd5_0.1.txt};
alpar@3
  1611
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1612
        {randGraph/ind/vf2ppInd5_0.1.txt};
alpar@2
  1613
\end{axis}
alpar@2
  1614
\end{tikzpicture}
alpar@2
  1615
\end{center}
alpar@2
  1616
\end{subfigure}
Madarasi@7
  1617
\hspace*{-1.5cm}
alpar@2
  1618
\begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1619
\begin{center}
alpar@2
  1620
\begin{tikzpicture}
alpar@2
  1621
\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.3$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1622
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1623
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1624
  format/1000 sep = \space}]
alpar@2
  1625
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1626
\addplot table {randGraph/ind/vf2pInd5_0.3.txt};
alpar@3
  1627
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1628
        {randGraph/ind/vf2ppInd5_0.3.txt};
alpar@2
  1629
\end{axis}
alpar@2
  1630
\end{tikzpicture}
alpar@2
  1631
\end{center}
alpar@2
  1632
     \end{subfigure}
alpar@2
  1633
     \begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1634
\begin{center}
alpar@2
  1635
\begin{tikzpicture}
alpar@2
  1636
\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.6$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1637
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1638
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1639
  format/1000 sep = \space}]
alpar@2
  1640
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1641
\addplot table {randGraph/ind/vf2pInd5_0.6.txt};
alpar@3
  1642
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1643
        {randGraph/ind/vf2ppInd5_0.6.txt};
alpar@2
  1644
\end{axis}
alpar@2
  1645
\end{tikzpicture}
alpar@2
  1646
\end{center}
alpar@2
  1647
\end{subfigure}
alpar@2
  1648
\begin{subfigure}[b]{0.55\textwidth}
Madarasi@7
  1649
\hspace*{-1.5cm}
alpar@2
  1650
\begin{tikzpicture}
alpar@2
  1651
\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.8$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1652
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1653
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1654
  format/1000 sep = \space}]
alpar@2
  1655
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1656
\addplot table {randGraph/ind/vf2pInd5_0.8.txt};
alpar@3
  1657
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1658
        {randGraph/ind/vf2ppInd5_0.8.txt};
alpar@2
  1659
\end{axis}
alpar@2
  1660
\end{tikzpicture}
alpar@2
  1661
     \end{subfigure}
Madarasi@7
  1662
     \hspace*{-1.5cm}
alpar@2
  1663
     \begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1664
\begin{tikzpicture}
alpar@2
  1665
\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.95$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1666
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1667
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1668
  format/1000 sep = \thinspace}]
alpar@2
  1669
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1670
\addplot table {randGraph/ind/vf2pInd5_0.95.txt};
alpar@3
  1671
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1672
        {randGraph/ind/vf2ppInd5_0.95.txt};
alpar@2
  1673
\end{axis}
alpar@2
  1674
\end{tikzpicture}
alpar@2
  1675
\end{subfigure}
alpar@2
  1676
\vspace*{-0.8cm}
alpar@3
  1677
\caption{IND on graphs having an average degree of
alpar@3
  1678
  5.}\label{fig:randIND5}
alpar@2
  1679
\end{figure}
alpar@2
  1680
alpar@2
  1681
\begin{figure}[H]
alpar@2
  1682
\begin{center}
Madarasi@7
  1683
\hspace*{-2cm}
alpar@2
  1684
\begin{tikzpicture}
alpar@2
  1685
\begin{axis}[title={Rand IND Summary, $\delta = 5$, $\rho = 0.3, 0.6, 0.8, 0.95$},height=17cm,width=16cm,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},line width=0.8pt,grid
alpar@3
  1686
=major,mark size=1pt, legend style={at={(0,1)},anchor=north
alpar@3
  1687
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1688
  format/1000 sep = \thinspace}]
alpar@2
  1689
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@3
  1690
\addplot[mark=*,mark size=1.5pt,color=blue] table
alpar@3
  1691
        {randGraph/ind/vf2pInd5_0.3.txt}; \addplot[mark=triangle*,mark
alpar@3
  1692
          size=1.8pt,color=red] table
alpar@3
  1693
        {randGraph/ind/vf2ppInd5_0.3.txt}; \addplot[mark=*,mark
alpar@3
  1694
          size=1.5pt,color=blue] table
alpar@3
  1695
        {randGraph/ind/vf2pInd5_0.6.txt}; \addplot[mark=triangle*,mark
alpar@3
  1696
          size=1.8pt,color=red] table
alpar@3
  1697
        {randGraph/ind/vf2ppInd5_0.6.txt}; \addplot[mark=*,mark
alpar@3
  1698
          size=1.5pt,color=blue] table
alpar@3
  1699
        {randGraph/ind/vf2pInd5_0.8.txt}; \addplot[mark=triangle*,mark
alpar@3
  1700
          size=1.8pt,color=red] table
alpar@3
  1701
        {randGraph/ind/vf2ppInd5_0.8.txt}; \addplot[mark=*,mark
alpar@3
  1702
          size=1.5pt,color=blue] table
alpar@3
  1703
        {randGraph/ind/vf2pInd5_0.95.txt};
alpar@3
  1704
        \addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1705
                {randGraph/ind/vf2ppInd5_0.95.txt};
alpar@2
  1706
\end{axis}
alpar@2
  1707
\end{tikzpicture}
alpar@2
  1708
\end{center}
alpar@2
  1709
\vspace*{-0.8cm}
alpar@2
  1710
\caption{Cummulative chart for $\delta=5$.}\label{fig:randIND5Sum}
alpar@2
  1711
\end{figure}
alpar@2
  1712
alpar@2
  1713
alpar@2
  1714
alpar@2
  1715
\begin{figure}[H]
Madarasi@7
  1716
\vspace*{-1.5cm}
Madarasi@7
  1717
\hspace*{-1.5cm}
alpar@2
  1718
\begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1719
\begin{center}
Madarasi@7
  1720
\hspace*{-0.5cm}
alpar@2
  1721
\begin{tikzpicture}
alpar@2
  1722
\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.05$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1723
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1724
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1725
  format/1000 sep = \space}]
alpar@2
  1726
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1727
\addplot table {randGraph/ind/vf2pInd10_0.05.txt};
alpar@3
  1728
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1729
        {randGraph/ind/vf2ppInd10_0.05.txt};
alpar@2
  1730
\end{axis}
alpar@2
  1731
\end{tikzpicture}
alpar@2
  1732
\end{center}
alpar@2
  1733
     \end{subfigure}
alpar@2
  1734
     \begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1735
\begin{center}
Madarasi@7
  1736
     \hspace*{-0.5cm}
alpar@2
  1737
\begin{tikzpicture}
alpar@2
  1738
\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.1$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1739
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1740
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1741
  format/1000 sep = \space}]
alpar@2
  1742
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1743
\addplot table {randGraph/ind/vf2pInd10_0.1.txt};
alpar@3
  1744
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1745
        {randGraph/ind/vf2ppInd10_0.1.txt};
alpar@2
  1746
\end{axis}
alpar@2
  1747
\end{tikzpicture}
alpar@2
  1748
\end{center}
alpar@2
  1749
\end{subfigure}
Madarasi@7
  1750
\hspace*{-1.5cm}
alpar@2
  1751
\begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1752
\begin{center}
alpar@2
  1753
\begin{tikzpicture}
alpar@2
  1754
\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.3$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1755
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1756
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1757
  format/1000 sep = \space}]
alpar@2
  1758
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1759
\addplot table {randGraph/ind/vf2pInd10_0.3.txt};
alpar@3
  1760
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1761
        {randGraph/ind/vf2ppInd10_0.3.txt};
alpar@2
  1762
\end{axis}
alpar@2
  1763
\end{tikzpicture}
alpar@2
  1764
\end{center}
alpar@2
  1765
     \end{subfigure}
alpar@2
  1766
     \begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1767
\begin{center}
alpar@2
  1768
\begin{tikzpicture}
alpar@2
  1769
\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.6$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1770
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1771
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1772
  format/1000 sep = \space}]
alpar@2
  1773
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1774
\addplot table {randGraph/ind/vf2pInd10_0.6.txt};
alpar@3
  1775
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1776
        {randGraph/ind/vf2ppInd10_0.6.txt};
alpar@2
  1777
\end{axis}
alpar@2
  1778
\end{tikzpicture}
alpar@2
  1779
\end{center}
alpar@2
  1780
\end{subfigure}
Madarasi@7
  1781
\hspace*{-1.5cm}
alpar@2
  1782
\begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1783
\begin{tikzpicture}
alpar@2
  1784
\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.8$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1785
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1786
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1787
  format/1000 sep = \space}]
alpar@2
  1788
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1789
\addplot table {randGraph/ind/vf2pInd10_0.8.txt};
alpar@3
  1790
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1791
        {randGraph/ind/vf2ppInd10_0.8.txt};
alpar@2
  1792
\end{axis}
alpar@2
  1793
\end{tikzpicture}
alpar@2
  1794
     \end{subfigure}
alpar@2
  1795
     \begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1796
\begin{tikzpicture}
alpar@2
  1797
\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.95$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1798
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1799
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1800
  format/1000 sep = \thinspace}]
alpar@2
  1801
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1802
\addplot table {randGraph/ind/vf2pInd10_0.95.txt};
alpar@3
  1803
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1804
        {randGraph/ind/vf2ppInd10_0.95.txt};
alpar@2
  1805
\end{axis}
alpar@2
  1806
\end{tikzpicture}
alpar@2
  1807
\end{subfigure}
alpar@2
  1808
\vspace*{-0.8cm}
alpar@3
  1809
\caption{IND on graphs having an average degree of
alpar@3
  1810
  10.}\label{fig:randIND10}
alpar@2
  1811
\end{figure}
alpar@2
  1812
alpar@2
  1813
\begin{figure}[H]
alpar@2
  1814
\begin{center}
Madarasi@7
  1815
\hspace*{-2cm}
alpar@2
  1816
\begin{tikzpicture}
alpar@2
  1817
\begin{axis}[title={Rand IND Summary, $\delta = 10$, $\rho = 0.3, 0.6, 0.8, 0.95$},height=17cm,width=16cm,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},line width=0.8pt,grid
alpar@3
  1818
=major,mark size=1pt, legend style={at={(0,1)},anchor=north
alpar@3
  1819
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1820
  format/1000 sep = \thinspace}]
alpar@2
  1821
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@3
  1822
\addplot[mark=*,mark size=1.5pt,color=blue] table
alpar@3
  1823
        {randGraph/ind/vf2pInd10_0.3.txt};
alpar@3
  1824
        \addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1825
                {randGraph/ind/vf2ppInd10_0.3.txt};
alpar@3
  1826
                \addplot[mark=*,mark size=1.5pt,color=blue] table
alpar@3
  1827
                        {randGraph/ind/vf2pInd10_0.6.txt};
alpar@3
  1828
                        \addplot[mark=triangle*,mark
alpar@3
  1829
                          size=1.8pt,color=red] table
alpar@3
  1830
                                {randGraph/ind/vf2ppInd10_0.6.txt};
alpar@3
  1831
                                \addplot[mark=*,mark
alpar@3
  1832
                                  size=1.5pt,color=blue] table
alpar@3
  1833
                                        {randGraph/ind/vf2pInd10_0.8.txt};
alpar@3
  1834
                                        \addplot[mark=triangle*,mark
alpar@3
  1835
                                          size=1.8pt,color=red] table
alpar@3
  1836
                                                {randGraph/ind/vf2ppInd10_0.8.txt};
alpar@3
  1837
                                                \addplot[mark=*,mark
alpar@3
  1838
                                                  size=1.5pt,color=blue]
alpar@3
  1839
                                                table
alpar@3
  1840
                                                {randGraph/ind/vf2pInd10_0.95.txt};
alpar@3
  1841
                                                \addplot[mark=triangle*,mark
alpar@3
  1842
                                                  size=1.8pt,color=red]
alpar@3
  1843
                                                table
alpar@3
  1844
                                                {randGraph/ind/vf2ppInd10_0.95.txt};
alpar@2
  1845
\end{axis}
alpar@2
  1846
\end{tikzpicture}
alpar@2
  1847
\end{center}
alpar@2
  1848
\vspace*{-0.8cm}
alpar@2
  1849
\caption{Cummulative chart for $\delta=10$.}\label{fig:randIND10Sum}
alpar@2
  1850
\end{figure}
alpar@2
  1851
alpar@2
  1852
alpar@2
  1853
alpar@2
  1854
\begin{figure}[H]
Madarasi@7
  1855
\vspace*{-1.5cm}
Madarasi@7
  1856
\hspace*{-1.5cm}
alpar@2
  1857
\begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1858
\begin{center}
alpar@2
  1859
\begin{tikzpicture}
alpar@2
  1860
\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.05$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1861
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1862
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1863
  format/1000 sep = \space}]
alpar@2
  1864
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1865
\addplot table {randGraph/ind/vf2pInd35_0.05.txt};
alpar@3
  1866
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1867
        {randGraph/ind/vf2ppInd35_0.05.txt};
alpar@2
  1868
\end{axis}
alpar@2
  1869
\end{tikzpicture}
alpar@2
  1870
\end{center}
alpar@2
  1871
     \end{subfigure}
alpar@2
  1872
     \begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1873
\begin{center}
alpar@2
  1874
\begin{tikzpicture}
alpar@2
  1875
\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.1$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1876
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1877
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1878
  format/1000 sep = \space}]
alpar@2
  1879
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1880
\addplot table {randGraph/ind/vf2pInd35_0.1.txt};
alpar@3
  1881
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1882
        {randGraph/ind/vf2ppInd35_0.1.txt};
alpar@2
  1883
\end{axis}
alpar@2
  1884
\end{tikzpicture}
alpar@2
  1885
\end{center}
alpar@2
  1886
\end{subfigure}
Madarasi@7
  1887
\hspace*{-1.5cm}
alpar@2
  1888
\begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1889
\begin{center}
alpar@2
  1890
\begin{tikzpicture}
alpar@2
  1891
\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.3$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1892
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1893
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1894
  format/1000 sep = \space}]
alpar@2
  1895
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1896
\addplot table {randGraph/ind/vf2pInd35_0.3.txt};
alpar@3
  1897
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1898
        {randGraph/ind/vf2ppInd35_0.3.txt};
alpar@2
  1899
\end{axis}
alpar@2
  1900
\end{tikzpicture}
alpar@2
  1901
\end{center}
alpar@2
  1902
     \end{subfigure}
alpar@2
  1903
     \begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1904
\begin{center}
alpar@2
  1905
\begin{tikzpicture}
alpar@2
  1906
\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.6$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1907
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1908
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1909
  format/1000 sep = \space}]
alpar@2
  1910
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1911
\addplot table {randGraph/ind/vf2pInd35_0.6.txt};
alpar@3
  1912
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1913
        {randGraph/ind/vf2ppInd35_0.6.txt};
alpar@2
  1914
\end{axis}
alpar@2
  1915
\end{tikzpicture}
alpar@2
  1916
\end{center}
alpar@2
  1917
\end{subfigure}
Madarasi@7
  1918
\hspace*{-1.5cm}
alpar@2
  1919
\begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1920
\begin{tikzpicture}
alpar@2
  1921
\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.8$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1922
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1923
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1924
  format/1000 sep = \space}]
alpar@2
  1925
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1926
\addplot table {randGraph/ind/vf2pInd35_0.8.txt};
alpar@3
  1927
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1928
        {randGraph/ind/vf2ppInd35_0.8.txt};
alpar@2
  1929
\end{axis}
alpar@2
  1930
\end{tikzpicture}
alpar@2
  1931
     \end{subfigure}
alpar@2
  1932
     \begin{subfigure}[b]{0.55\textwidth}
alpar@2
  1933
\begin{tikzpicture}
alpar@2
  1934
\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.95$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3
  1935
=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3
  1936
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1937
  format/1000 sep = \thinspace}]
alpar@2
  1938
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2
  1939
\addplot table {randGraph/ind/vf2pInd35_0.95.txt};
alpar@3
  1940
\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1941
        {randGraph/ind/vf2ppInd35_0.95.txt};
alpar@2
  1942
\end{axis}
alpar@2
  1943
\end{tikzpicture}
alpar@2
  1944
\end{subfigure}
alpar@2
  1945
\vspace*{-0.8cm}
alpar@3
  1946
\caption{IND on graphs having an average degree of
alpar@3
  1947
  35.}\label{fig:randIND35}
alpar@2
  1948
\end{figure}
alpar@2
  1949
alpar@2
  1950
\begin{figure}[H]
alpar@2
  1951
\begin{center}
Madarasi@7
  1952
\hspace*{-2cm}
alpar@2
  1953
\begin{tikzpicture}
alpar@2
  1954
\begin{axis}[title={Rand IND Summary, $\delta = 35$, $\rho = 0.3, 0.6, 0.8, 0.95$},height=17cm,width=16cm,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},line width=0.8pt,grid
alpar@3
  1955
=major,mark size=1pt, legend style={at={(0,1)},anchor=north
alpar@3
  1956
  west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3
  1957
  format/1000 sep = \thinspace}]
alpar@2
  1958
%\addplot+[only marks] table {proteinsOrig.txt};
alpar@3
  1959
\addplot[mark=*,mark size=1.5pt,color=blue] table
alpar@3
  1960
        {randGraph/ind/vf2pInd35_0.3.txt};
alpar@3
  1961
        \addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3
  1962
                {randGraph/ind/vf2ppInd35_0.3.txt};
alpar@3
  1963
                \addplot[mark=*,mark size=1.5pt,color=blue] table
alpar@3
  1964
                        {randGraph/ind/vf2pInd35_0.6.txt};
alpar@3
  1965
                        \addplot[mark=triangle*,mark
alpar@3
  1966
                          size=1.8pt,color=red] table
alpar@3
  1967
                                {randGraph/ind/vf2ppInd35_0.6.txt};
alpar@3
  1968
                                \addplot[mark=*,mark
alpar@3
  1969
                                  size=1.5pt,color=blue] table
alpar@3
  1970
                                        {randGraph/ind/vf2pInd35_0.8.txt};
alpar@3
  1971
                                        \addplot[mark=triangle*,mark
alpar@3
  1972
                                          size=1.8pt,color=red] table
alpar@3
  1973
                                                {randGraph/ind/vf2ppInd35_0.8.txt};
alpar@3
  1974
                                                \addplot[mark=*,mark
alpar@3
  1975
                                                  size=1.5pt,color=blue]
alpar@3
  1976
                                                table
alpar@3
  1977
                                                {randGraph/ind/vf2pInd35_0.95.txt};
alpar@3
  1978
                                                \addplot[mark=triangle*,mark
alpar@3
  1979
                                                  size=1.8pt,color=red]
alpar@3
  1980
                                                table
alpar@3
  1981
                                                {randGraph/ind/vf2ppInd35_0.95.txt};
alpar@2
  1982
\end{axis}
alpar@2
  1983
\end{tikzpicture}
alpar@2
  1984
\end{center}
alpar@2
  1985
\vspace*{-0.8cm}
alpar@2
  1986
\caption{Cummulative chart for $\delta=35$.}\label{fig:randIND35Sum}
alpar@2
  1987
\end{figure}
alpar@2
  1988
alpar@3
  1989
Based on these experiments, VF2++ is faster than VF2 Plus and able to
alpar@3
  1990
handle really large graphs in milliseconds. Note that when $IND$ was
alpar@3
  1991
considered and the small graphs had proportionally few nodes ($\rho =
alpar@3
  1992
0.05$, or $\rho = 0.1$), then VF2 Plus produced some inefficient node
alpar@4
  1993
orders (e.g. see the $\delta=10$ case on
alpar@4
  1994
Figure~\ref{fig:randIND10}). If these examples had been excluded, the
alpar@3
  1995
charts would have seemed to be similar to the other ones.
alpar@3
  1996
Unsurprisingly, as denser graphs are considered, both VF2++ and VF2
alpar@3
  1997
Plus slow slightly down, but remain practically usable even on graphs
alpar@3
  1998
having 10 000 nodes.
alpar@2
  1999
alpar@2
  2000
alpar@2
  2001
alpar@2
  2002
alpar@3
  2003
alpar@2
  2004
\section{Conclusion}
alpar@3
  2005
In this paper, after providing a short summary of the recent
alpar@3
  2006
algorithms, a new graph matching algorithm based on VF2, called VF2++,
alpar@3
  2007
has been presented and analyzed from a practical viewpoint.
alpar@2
  2008
alpar@3
  2009
Recognizing the importance of the node order and determining an
alpar@3
  2010
efficient one, VF2++ is able to match graphs of thousands of nodes in
alpar@3
  2011
near practically linear time including preprocessing. In addition to
alpar@3
  2012
the proper order, VF2++ uses more efficient consistency and cutting
alpar@3
  2013
rules which are easy to compute and make the algorithm able to prune
alpar@3
  2014
most of the unfruitful branches without going astray.
alpar@2
  2015
alpar@3
  2016
In order to show the efficiency of the new method, it has been
alpar@3
  2017
compared to VF2 Plus, which is the best concurrent algorithm based on
alpar@3
  2018
\cite{VF2Plus}.
alpar@2
  2019
alpar@3
  2020
The experiments show that VF2++ consistently outperforms VF2 Plus on
alpar@3
  2021
biological graphs. It seems to be asymptotically faster on protein and
alpar@3
  2022
on contact map graphs in the case of induced subgraph isomorphism,
alpar@3
  2023
while in the case of graph isomorphism, it has definitely better
alpar@3
  2024
asymptotic behaviour on protein graphs.
alpar@2
  2025
alpar@3
  2026
Regarding random sparse graphs, not only has VF2++ proved itself to be
alpar@3
  2027
faster than VF2 Plus, but it has a practically linear behaviour both
alpar@3
  2028
in the case of induced subgraph- and graph isomorphism, as well.
alpar@2
  2029
alpar@2
  2030
alpar@0
  2031
alpar@0
  2032
%% The Appendices part is started with the command \appendix;
alpar@0
  2033
%% appendix sections are then done as normal sections
alpar@0
  2034
%% \appendix
alpar@0
  2035
alpar@0
  2036
%% \section{}
alpar@0
  2037
%% \label{}
alpar@0
  2038
alpar@0
  2039
%% If you have bibdatabase file and want bibtex to generate the
alpar@0
  2040
%% bibitems, please use
alpar@0
  2041
%%
alpar@3
  2042
\bibliographystyle{elsarticle-num} \bibliography{bibliography}
alpar@0
  2043
alpar@0
  2044
%% else use the following coding to input the bibitems directly in the
alpar@0
  2045
%% TeX file.
alpar@0
  2046
alpar@2
  2047
%% \begin{thebibliography}{00}
alpar@0
  2048
alpar@2
  2049
%% %% \bibitem{label}
alpar@2
  2050
%% %% Text of bibliographic item
alpar@0
  2051
alpar@2
  2052
%% \bibitem{}
alpar@0
  2053
alpar@2
  2054
%% \end{thebibliography}
alpar@2
  2055
alpar@0
  2056
\end{document}
alpar@0
  2057
\endinput
alpar@0
  2058
%%
alpar@0
  2059
%% End of file `elsarticle-template-num.tex'.