lemon-vf2: damecco.tex@4989d677d5e3 (annotated)

alpar@0	1	%%
alpar@0	2	%% Copyright 2007, 2008, 2009 Elsevier Ltd
alpar@0	3	%%
alpar@0	4	%% This file is part of the 'Elsarticle Bundle'.
alpar@0	5	%% ---------------------------------------------
alpar@0	6	%%
alpar@0	7	%% It may be distributed under the conditions of the LaTeX Project Public
alpar@0	8	%% License, either version 1.2 of this license or (at your option) any
alpar@0	9	%% later version. The latest version of this license is in
alpar@0	10	%% http://www.latex-project.org/lppl.txt
alpar@0	11	%% and version 1.2 or later is part of all distributions of LaTeX
alpar@0	12	%% version 1999/12/01 or later.
alpar@0	13	%%
alpar@0	14	%% The list of all files belonging to the 'Elsarticle Bundle' is
alpar@0	15	%% given in the file `manifest.txt'.
alpar@0	16	%%
alpar@0	17
alpar@0	18	%% Template article for Elsevier's document class `elsarticle'
alpar@0	19	%% with numbered style bibliographic references
alpar@0	20	%% SP 2008/03/01
alpar@0	21
alpar@0	22	\documentclass[preprint,12pt]{elsarticle}
alpar@0	23
alpar@0	24	%% Use the option review to obtain double line spacing
alpar@0	25	%% \documentclass[authoryear,preprint,review,12pt]{elsarticle}
alpar@0	26
alpar@0	27	%% Use the options 1p,twocolumn; 3p; 3p,twocolumn; 5p; or 5p,twocolumn
alpar@0	28	%% for a journal layout:
alpar@0	29	%% \documentclass[final,1p,times]{elsarticle}
alpar@0	30	%% \documentclass[final,1p,times,twocolumn]{elsarticle}
alpar@0	31	%% \documentclass[final,3p,times]{elsarticle}
alpar@0	32	%% \documentclass[final,3p,times,twocolumn]{elsarticle}
alpar@0	33	%% \documentclass[final,5p,times]{elsarticle}
alpar@0	34	%% \documentclass[final,5p,times,twocolumn]{elsarticle}
alpar@0	35
alpar@0	36	%% For including figures, graphicx.sty has been loaded in
alpar@0	37	%% elsarticle.cls. If you prefer to use the old commands
alpar@0	38	%% please give \usepackage{epsfig}
alpar@0	39
alpar@0	40	%% The amssymb package provides various useful mathematical symbols
alpar@0	41	\usepackage{amssymb}
alpar@0	42	%% The amsthm package provides extended theorem environments
alpar@0	43	%% \usepackage{amsthm}
alpar@0	44
alpar@0	45	%% The lineno packages adds line numbers. Start line numbering with
alpar@0	46	%% \begin{linenumbers}, end it with \end{linenumbers}. Or switch it on
alpar@0	47	%% for the whole article with \linenumbers.
alpar@0	48	%% \usepackage{lineno}
alpar@0	49
alpar@2	50	\usepackage{amsmath}
alpar@2	51	%% \usepackage[pdftex]{graphicx}
alpar@2	52
alpar@2	53	\usepackage{pgfplots}
alpar@2	54	\pgfplotsset{width=9cm}
alpar@2	55	\pgfplotsset{compat=1.8}
alpar@2	56
alpar@2	57	\usepackage{caption}
alpar@2	58	\usepackage{subcaption}
alpar@2	59
alpar@2	60	\usepackage{algorithm}
alpar@2	61	\usepackage{algpseudocode}
alpar@2	62	\usepackage{tikz}
alpar@2	63
alpar@2	64	\usepackage{amsthm,amssymb}
alpar@2	65	\renewcommand{\qedsymbol}{\rule{0.7em}{0.7em}}
alpar@2	66
alpar@2	67	\newtheorem{theorem}{Theorem}[subsection]
alpar@2	68	\newtheorem{corollary}{Corollary}[theorem]
alpar@2	69	\newtheorem{claim}[theorem]{Claim}
alpar@2	70
alpar@2	71	\newtheorem{definition}{Definition}[subsection]
alpar@2	72	\newtheorem{notation}{Notation}[subsection]
alpar@2	73	\newtheorem{example}{Example}[subsection]
alpar@2	74	\usetikzlibrary{decorations.markings}
alpar@2	75	\let\oldproofname=\proofname
alpar@2	76	%% \renewcommand{\proofname}{\rm\bf{Proof:}}
alpar@2	77
Madarasi@7	78	\captionsetup{font=normalsize}
Madarasi@7	79
alpar@1	80	\journal{Discrete Applied Mathematics}
alpar@0	81
alpar@0	82	\begin{document}
alpar@0	83
alpar@0	84	\begin{frontmatter}
alpar@0	85
alpar@0	86	%% Title, authors and addresses
alpar@0	87
alpar@0	88	%% use the tnoteref command within \title for footnotes;
alpar@0	89	%% use the tnotetext command for theassociated footnote;
alpar@0	90	%% use the fnref command within \author or \address for footnotes;
alpar@0	91	%% use the fntext command for theassociated footnote;
alpar@0	92	%% use the corref command within \author for corresponding author footnotes;
alpar@0	93	%% use the cortext command for theassociated footnote;
alpar@0	94	%% use the ead command for the email address,
alpar@0	95	%% and the form \ead[url] for the home page:
alpar@0	96	%% \title{Title\tnoteref{label1}}
alpar@0	97	%% \tnotetext[label1]{}
alpar@0	98	%% \author{Name\corref{cor1}\fnref{label2}}
alpar@0	99	%% \ead{email address}
alpar@0	100	%% \ead[url]{home page}
alpar@0	101	%% \fntext[label2]{}
alpar@0	102	%% \cortext[cor1]{}
alpar@0	103	%% \address{Address\fnref{label3}}
alpar@0	104	%% \fntext[label3]{}
alpar@0	105
alpar@1	106	\title{Improved Algorithms for Matching Biological Graphs}
alpar@0	107
alpar@0	108	%% use optional labels to link authors explicitly to addresses:
alpar@0	109	%% \author[label1,label2]{}
alpar@0	110	%% \address[label1]{}
alpar@0	111	%% \address[label2]{}
alpar@0	112
alpar@1	113	\author{Alp{\'a}r J{\"u}ttner and P{\'e}ter Madarasi}
alpar@0	114
alpar@1	115	\address{Dept of Operations Research, ELTE}
alpar@0	116
alpar@0	117	\begin{abstract}
alpar@1	118	Subgraph isomorphism is a well-known NP-Complete problem, while its
alpar@1	119	special case, the graph isomorphism problem is one of the few problems
alpar@1	120	in NP neither known to be in P nor NP-Complete. Their appearance in
alpar@1	121	many fields of application such as pattern analysis, computer vision
alpar@1	122	questions and the analysis of chemical and biological systems has
alpar@1	123	fostered the design of various algorithms for handling special graph
alpar@1	124	structures.
alpar@0	125
alpar@1	126	The idea of using state space representation and checking some
alpar@1	127	conditions in each state to prune the search tree has made the VF2
alpar@1	128	algorithm one of the state of the art graph matching algorithms for
alpar@1	129	more than a decade. Recently, biological questions of ever increasing
alpar@1	130	importance have required more efficient, specialized algorithms.
alpar@1	131
alpar@1	132	This paper presents VF2++, a new algorithm based on the original VF2,
alpar@1	133	which runs significantly faster on most test cases and performs
alpar@1	134	especially well on special graph classes stemming from biological
alpar@1	135	questions. VF2++ handles graphs of thousands of nodes in practically
alpar@1	136	near linear time including preprocessing. Not only is it an improved
alpar@1	137	version of VF2, but in fact, it is by far the fastest existing
alpar@1	138	algorithm regarding biological graphs.
alpar@1	139
alpar@1	140	The reason for VF2++' superiority over VF2 is twofold. Firstly, taking
alpar@1	141	into account the structure and the node labeling of the graph, VF2++
alpar@1	142	determines a state order in which most of the unfruitful branches of
alpar@1	143	the search space can be pruned immediately. Secondly, introducing more
alpar@1	144	efficient - nevertheless still easier to compute - cutting rules
alpar@1	145	reduces the chance of going astray even further.
alpar@1	146
alpar@1	147	In addition to the usual subgraph isomorphism, specialized versions
alpar@1	148	for induced subgraph isomorphism and for graph isomorphism are
alpar@1	149	presented. VF2++ has gained a runtime improvement of one order of
alpar@1	150	magnitude respecting induced subgraph isomorphism and a better
alpar@1	151	asymptotical behaviour in the case of graph isomorphism problem.
alpar@1	152
alpar@1	153	After having provided the description of VF2++, in order to evaluate
alpar@1	154	its effectiveness, an extensive comparison to the contemporary other
alpar@1	155	algorithms is shown, using a wide range of inputs, including both real
alpar@1	156	life biological and chemical datasets and standard randomly generated
alpar@1	157	graph series.
alpar@1	158
alpar@1	159	The work was motivated and sponsored by QuantumBio Inc., and all the
alpar@1	160	developed algorithms are available as the part of the open source
alpar@1	161	LEMON graph and network optimization library
alpar@1	162	(http://lemon.cs.elte.hu).
alpar@0	163	\end{abstract}
alpar@0	164
alpar@0	165	\begin{keyword}
alpar@0	166	%% keywords here, in the form: keyword \sep keyword
alpar@0	167
alpar@0	168	%% PACS codes here, in the form: \PACS code \sep code
alpar@0	169
alpar@0	170	%% MSC codes here, in the form: \MSC code \sep code
alpar@0	171	%% or \MSC[2008] code \sep code (2000 is the default)
alpar@0	172
alpar@0	173	\end{keyword}
alpar@0	174
alpar@0	175	\end{frontmatter}
alpar@0	176
alpar@0	177	%% \linenumbers
alpar@0	178
alpar@0	179	%% main text
alpar@2	180	\section{Introduction}
alpar@2	181	\label{sec:intro}
alpar@2	182
alpar@3	183	In the last decades, combinatorial structures, and especially graphs
alpar@3	184	have been considered with ever increasing interest, and applied to the
alpar@3	185	solution of several new and revised questions. The expressiveness,
alpar@3	186	the simplicity and the studiedness of graphs make them practical for
alpar@3	187	modelling and appear constantly in several seemingly independent
alpar@3	188	fields. Bioinformatics and chemistry are amongst the most relevant
alpar@3	189	and most important fields.
alpar@2	190
alpar@3	191	Complex biological systems arise from the interaction and cooperation
alpar@3	192	of plenty of molecular components. Getting acquainted with such
alpar@3	193	systems at the molecular level has primary importance, since
alpar@3	194	protein-protein interaction, DNA-protein interaction, metabolic
alpar@3	195	interaction, transcription factor binding, neuronal networks, and
alpar@3	196	hormone signaling networks can be understood only this way.
alpar@2	197
alpar@3	198	For instance, a molecular structure can be considered as a graph,
alpar@3	199	whose nodes correspond to atoms and whose edges to chemical bonds. The
alpar@3	200	secondary structure of a protein can also be represented as a graph,
alpar@3	201	where nodes are associated with aminoacids and the edges with hydrogen
alpar@3	202	bonds. The nodes are often whole molecular components and the edges
alpar@3	203	represent some relationships among them. The similarity and
alpar@3	204	dissimilarity of objects corresponding to nodes are incorporated to
alpar@3	205	the model by \emph{node labels}. Many other chemical and biological
alpar@3	206	structures can easily be modeled in a similar way. Understanding such
alpar@3	207	networks basically requires finding specific subgraphs, which can not
alpar@3	208	avoid the application of graph matching algorithms.
alpar@2	209
alpar@3	210	Finally, let some of the other real-world fields related to some
alpar@3	211	variants of graph matching be briefly mentioned: pattern recognition
alpar@3	212	and machine vision \cite{HorstBunkeApplications}, symbol recognition
alpar@3	213	\cite{CordellaVentoSymbolRecognition}, face identification
alpar@3	214	\cite{JianzhuangYongFaceIdentification}. \\
alpar@2	215
alpar@3	216	Subgraph and induced subgraph matching problems are known to be
alpar@3	217	NP-Complete\cite{SubgraphNPC}, while the graph isomorphism problem is
alpar@3	218	one of the few problems in NP neither known to be in P nor
alpar@3	219	NP-Complete. Although polynomial time isomorphism algorithms are known
alpar@3	220	for various graph classes, like trees and planar
alpar@3	221	graphs\cite{PlanarGraphIso}, bounded valence
alpar@3	222	graphs\cite{BondedDegGraphIso}, interval graphs\cite{IntervalGraphIso}
alpar@3	223	or permutation graphs\cite{PermGraphIso}.
alpar@2	224
alpar@3	225	In the following, some algorithms based on other approaches are
alpar@3	226	summarized, which do not need any restrictions on the graphs. However,
alpar@3	227	an overall polynomial behaviour is not expectable from such an
alpar@3	228	alternative, it may often have good performance, even on a graph class
alpar@3	229	for which polynomial algorithm is known. Note that this summary
alpar@3	230	containing only exact matching algorithms is far not complete, neither
alpar@3	231	does it cover all the recent algorithms.
alpar@2	232
alpar@3	233	The first practically usable approach was due to
alpar@4	234	Ullmann\cite{Ullmann} which is a commonly used depth-first
alpar@3	235	search based algorithm with a complex heuristic for reducing the
alpar@3	236	number of visited states. A major problem is its $\Theta(n^3)$ space
alpar@3	237	complexity, which makes it impractical in the case of big sparse
alpar@3	238	graphs.
alpar@2	239
alpar@4	240	In a recent paper, Ullmann\cite{UllmannBit} presents an
alpar@3	241	improved version of this algorithm based on a bit-vector solution for
alpar@3	242	the binary Constraint Satisfaction Problem.
alpar@2	243
alpar@4	244	The Nauty algorithm\cite{Nauty} transforms the two graphs to
alpar@3	245	a canonical form before starting to check for the isomorphism. It has
alpar@3	246	been considered as one of the fastest graph isomorphism algorithms,
alpar@3	247	although graph categories were shown in which it takes exponentially
alpar@3	248	many steps. This algorithm handles only the graph isomorphism problem.
alpar@2	249
alpar@4	250	The \emph{LAD} algorithm\cite{Lad} uses a depth-first search
alpar@3	251	strategy and formulates the matching as a Constraint Satisfaction
alpar@3	252	Problem to prune the search tree. The constraints are that the mapping
alpar@3	253	has to be injective and edge-preserving, hence it is possible to
alpar@3	254	handle new matching types as well.
alpar@2	255
alpar@3	256	The \textbf{RI} algorithm\cite{RI} and its variations are based on a
alpar@3	257	state space representation. After reordering the nodes of the graphs,
alpar@3	258	it uses some fast executable heuristic checks without using any
alpar@3	259	complex pruning rules. It seems to run really efficiently on graphs
alpar@3	260	coming from biology, and won the International Contest on Pattern
alpar@3	261	Search in Biological Databases\cite{Content}.
alpar@2	262
alpar@3	263	The currently most commonly used algorithm is the
alpar@3	264	\textbf{VF2}\cite{VF2}, the improved version of VF\cite{VF}, which was
alpar@3	265	designed for solving pattern matching and computer vision problems,
alpar@3	266	and has been one of the best overall algorithms for more than a
alpar@3	267	decade. Although, it can't be up to new specialized algorithms, it is
alpar@3	268	still widely used due to its simplicity and space efficiency. VF2 uses
alpar@3	269	a state space representation and checks some conditions in each state
alpar@3	270	to prune the search tree.
alpar@2	271
alpar@3	272	Our first graph matching algorithm was the first version of VF2 which
alpar@3	273	recognizes the significance of the node ordering, more opportunities
alpar@3	274	to increase the cutting efficiency and reduce its computational
alpar@3	275	complexity. This project was initiated and sponsored by QuantumBio
alpar@3	276	Inc.\cite{QUANTUMBIO} and the implementation --- along with a source
alpar@3	277	code --- has been published as a part of LEMON\cite{LEMON} open source
alpar@3	278	graph library.
alpar@2	279
alpar@3	280	This paper introduces \textbf{VF2++}, a new further improved algorithm
alpar@3	281	for the graph and (induced)subgraph isomorphism problem, which uses
alpar@3	282	efficient cutting rules and determines a node order in which VF2 runs
alpar@3	283	significantly faster on practical inputs.
alpar@2	284
alpar@3	285	Meanwhile, another variant called \textbf{VF2 Plus}\cite{VF2Plus} has
alpar@3	286	been published. It is considered to be as efficient as the RI
alpar@3	287	algorithm and has a strictly better behavior on large graphs. The
alpar@3	288	main idea of VF2 Plus is to precompute a heuristic node order of the
alpar@3	289	small graph, in which the VF2 works more efficiently.
alpar@2	290
alpar@2	291	\section{Problem Statement}
alpar@3	292	This section provides a detailed description of the problems to be
alpar@3	293	solved.
alpar@2	294	\subsection{Definitions}
alpar@2	295
alpar@3	296	Throughout the paper $G_{small}=(V_{small}, E_{small})$ and
alpar@3	297	$G_{large}=(V_{large}, E_{large})$ denote two undirected graphs.
alpar@2	298	\begin{definition}\label{sec:ismorphic}
alpar@3	299	$G_{small}$ and $G_{large}$ are \textbf{isomorphic} if $\exists M:
alpar@3	300	V_{small} \longrightarrow V_{large}$ bijection, for which the
alpar@3	301	following is true:
alpar@2	302	\begin{center}
alpar@3	303	$\forall u,v\in{V_{small}} : (u,v)\in{E_{small}} \Leftrightarrow
alpar@3	304	(M(u),M(v))\in{E_{large}}$
alpar@2	305	\end{center}
alpar@2	306	\end{definition}
alpar@3	307	For the sake of simplicity in this paper subgraphs and induced
alpar@3	308	subgraphs are defined in a more general way than usual:
alpar@2	309	\begin{definition}
alpar@3	310	$G_{small}$ is a \textbf{subgraph} of $G_{large}$ if $\exists I:
alpar@3	311	V_{small}\longrightarrow V_{large}$ injection, for which the
alpar@3	312	following is true:
alpar@2	313	\begin{center}
alpar@2	314	$\forall u,v \in{V_{small}} : (u,v)\in{E_{small}} \Rightarrow (I(u),I(v))\in E_{large}$
alpar@2	315	\end{center}
alpar@2	316	\end{definition}
alpar@2	317
alpar@2	318	\begin{definition}
alpar@3	319	$G_{small}$ is an \textbf{induced subgraph} of $G_{large}$ if $\exists
alpar@3	320	I: V_{small}\longrightarrow V_{large}$ injection, for which the
alpar@3	321	following is true:
alpar@2	322	\begin{center}
alpar@3	323	$\forall u,v \in{V_{small}} : (u,v)\in{E_{small}} \Leftrightarrow
alpar@3	324	(I(u),I(v))\in E_{large}$
alpar@2	325	\end{center}
alpar@2	326	\end{definition}
alpar@2	327
alpar@2	328	\begin{definition}
alpar@3	329	$lab: (V_{small}\cup V_{large}) \longrightarrow K$ is a \textbf{node
alpar@3	330	label function}, where K is an arbitrary set. The elements in K
alpar@3	331	are the \textbf{node labels}. Two nodes, u and v are said to be
alpar@3	332	\textbf{equivalent}, if $lab(u)=lab(v)$.
alpar@2	333	\end{definition}
alpar@2	334
alpar@3	335	When node labels are also given, the matched nodes must have the same
alpar@3	336	labels. For example, the node labeled isomorphism is phrased by
alpar@2	337	\begin{definition}
alpar@3	338	$G_{small}$ and $G_{large}$ are \textbf{isomorphic by the node label
alpar@3	339	function lab} if $\exists M: V_{small} \longrightarrow V_{large}$
alpar@3	340	bijection, for which the following is true:
alpar@2	341	\begin{center}
alpar@3	342	$(\forall u,v\in{V_{small}} : (u,v)\in{E_{small}} \Leftrightarrow
alpar@3	343	(M(u),M(v))\in{E_{large}})$ and $(\forall u\in{V_{small}} :
alpar@3	344	lab(u)=lab(M(u)))$
alpar@2	345	\end{center}
alpar@2	346	\end{definition}
alpar@2	347
alpar@2	348	The other two definitions can be extended in the same way.
alpar@2	349
alpar@3	350	Note that edge label function can be defined similarly to node label
alpar@3	351	function, and all the definitions can be extended with additional
alpar@3	352	conditions, but it is out of the scope of this work.
alpar@2	353
alpar@3	354	The equivalence of two nodes is usually defined by another relation,
alpar@3	355	$\\R\subseteq (V_{small}\cup V_{large})^2$. This overlaps with the
alpar@3	356	definition given above if R is an equivalence relation, which does not
alpar@3	357	mean restriction in biological and chemical applications.
alpar@2	358
alpar@2	359	\subsection{Common problems}\label{sec:CommProb}
alpar@2	360
alpar@3	361	The focus of this paper is on two extensively studied topics, the
alpar@3	362	subgraph isomorphism and its variations. However, the following
alpar@3	363	problems also appear in many applications.
alpar@2	364
alpar@3	365	The \textbf{subgraph matching problem} is the following: is
alpar@3	366	$G_{small}$ isomorphic to any subgraph of $G_{large}$ by a given node
alpar@3	367	label?
alpar@2	368
alpar@3	369	The \textbf{induced subgraph matching problem} asks the same about the
alpar@3	370	existence of an induced subgraph.
alpar@2	371
alpar@3	372	The \textbf{graph isomorphism problem} can be defined as induced
alpar@3	373	subgraph matching problem where the sizes of the two graphs are equal.
alpar@2	374
alpar@3	375	In addition to existence, it may be needed to show such a subgraph, or
alpar@3	376	it may be necessary to list all of them.
alpar@2	377
alpar@3	378	It should be noted that some authors misleadingly refer to the term
alpar@3	379	\emph{subgraph isomorphism problem} as an \emph{induced subgraph
alpar@3	380	isomorphism problem}.
alpar@2	381
alpar@3	382	The following sections give the descriptions of VF2, VF2++, VF2 Plus
alpar@3	383	and a particular comparison.
alpar@2	384
alpar@2	385	\section{The VF2 Algorithm}
alpar@3	386	This algorithm is the basis of both the VF2++ and the VF2 Plus. VF2
alpar@4	387	is able to handle all the variations mentioned in Section
alpar@4	388	\ref{sec:CommProb}. Although it can also handle directed graphs,
alpar@3	389	for the sake of simplicity, only the undirected case will be
alpar@3	390	discussed.
alpar@2	391
alpar@2	392
alpar@2	393	\subsection{Common notations}
alpar@3	394	\indent Assume $G_{small}$ is searched in $G_{large}$. The following
alpar@3	395	definitions and notations will be used throughout the whole paper.
alpar@2	396	\begin{definition}
alpar@3	397	A set $M\subseteq V_{small}\times V_{large}$ is called
alpar@3	398	\textbf{mapping}, if no node of $V_{small}$ or of $V_{large}$ appears
alpar@3	399	in more than one pair in M. That is, M uniquely associates some of
alpar@3	400	the nodes in $V_{small}$ with some nodes of $V_{large}$ and vice
alpar@3	401	versa.
alpar@2	402	\end{definition}
alpar@2	403
alpar@2	404	\begin{definition}
alpar@3	405	Mapping M \textbf{covers} a node v, if there exists a pair in M, which
alpar@3	406	contains v.
alpar@2	407	\end{definition}
alpar@2	408
alpar@2	409	\begin{definition}
alpar@3	410	A mapping $M$ is $\mathbf{whole\ mapping}$, if $M$ covers all the
alpar@3	411	nodes in $V_{small}$.
alpar@2	412	\end{definition}
alpar@2	413
alpar@2	414	\begin{notation}
alpar@3	415	Let $\mathbf{M_{small}(s)} := \{u\in V_{small} : \exists v\in
alpar@3	416	V_{large}: (u,v)\in M(s)\}$ and $\mathbf{M_{large}(s)} := \{v\in
alpar@3	417	V_{large} : \exists u\in V_{small}: (u,v)\in M(s)\}$.
alpar@2	418	\end{notation}
alpar@2	419
alpar@2	420	\begin{notation}
alpar@3	421	Let $\mathbf{Pair(M,v)}$ be the pair of $v$ in $M$, if such a node
alpar@3	422	exist, otherwise $\mathbf{Pair(M,v)}$ is undefined. For a mapping $M$
alpar@3	423	and $v\in V_{small}\cup V_{large}$.
alpar@2	424	\end{notation}
alpar@2	425
alpar@2	426	Note that if $\mathbf{Pair(M,v)}$ exists, then it is unique
alpar@2	427
alpar@3	428	The definitions of the isomorphism types can be rephrased on the
alpar@3	429	existence of a special whole mapping $M$, since it represents a
alpar@3	430	bijection. For example
alpar@2	431	\begin{center}
alpar@3	432	$M\subseteq V_{small}\times V_{large}$ represents an induced subgraph
alpar@3	433	isomorphism $\Leftrightarrow$ $M$ is whole mapping and $\forall u,v
alpar@3	434	\in{V_{small}} : (u,v)\in{E_{small}} \Leftrightarrow
alpar@3	435	(Pair(M,u),Pair(M,v))\in E_{large}$.
alpar@2	436	\end{center}
alpar@2	437
alpar@2	438	\begin{definition}
alpar@2	439	A set of whole mappings is called \textbf{problem type}.
alpar@2	440	\end{definition}
alpar@3	441	Throughout the paper, $\mathbf{PT}$ denotes a generic problem type
alpar@3	442	which can be substituted by any problem type.
alpar@2	443
alpar@3	444	A whole mapping $W\mathbf{\ is\ of\ type\ PT}$, if $W\in PT$. Using
alpar@3	445	this notations, VF2 searches a whole mapping $W$ of type $PT$.
alpar@2	446
alpar@3	447	For example the problem type of graph isomorphism problem is the
alpar@3	448	following. A whole mapping $W$ is in $\mathbf{ISO}$, iff the
alpar@4	449	bijection represented by $W$ satisfies Definition~\ref{sec:ismorphic}.
alpar@4	450	The subgraph- and induced subgraph matching problems can be formalized
alpar@4	451	in a similar way. Let their problem types be denoted as $\mathbf{SUB}$
alpar@4	452	and $\mathbf{IND}$.
alpar@2	453
alpar@2	454	\begin{definition}
alpar@2	455	\label{expPT}
alpar@3	456	$PT$ is an \textbf{expanding problem type} if $\ \forall\ W\in
alpar@3	457	PT:\ \forall u_1,u_2\in V_{small}:\ (u_1,u_2)\in E_{small}\Rightarrow
alpar@3	458	(Pair(W,u_1),Pair(W,u_2))\in E_{large}$, that is each edge of
alpar@3	459	$G_{small}$ has to be mapped to an edge of $G_{large}$ for each
alpar@3	460	mapping in $PT$.
alpar@2	461	\end{definition}
alpar@2	462
alpar@2	463	Note that $ISO$, $SUB$ and $IND$ are expanding problem types.
alpar@2	464
alpar@3	465	This paper deals with the three problem types mentioned above only,
alpar@3	466	but the following generic definitions make it possible to handle other
alpar@3	467	types as well. Although it may be challenging to find a proper
alpar@3	468	consistency function and an efficient cutting function.
alpar@2	469
alpar@2	470	\begin{definition}
alpar@3	471	Let M be a mapping. A logical function $\mathbf{Cons_{PT}}$ is a
alpar@3	472	\textbf{consistency function by } $\mathbf{PT}$, if the following
alpar@3	473	holds. If there exists whole mapping $W$ of type PT for which
alpar@3	474	$M\subseteq W$, then $Cons_{PT}(M)$ is true.
alpar@2	475	\end{definition}
alpar@2	476
alpar@2	477	\begin{definition}
alpar@3	478	Let M be a mapping. A logical function $\mathbf{Cut_{PT}}$ is a
alpar@3	479	\textbf{cutting function by } $\mathbf{PT}$, if the following
alpar@3	480	holds. $\mathbf{Cut_{PT}(M)}$ is false if $M$ can be extended to a
alpar@3	481	whole mapping W of type PT.
alpar@2	482	\end{definition}
alpar@2	483
alpar@2	484	\begin{definition}
alpar@3	485	$M$ is said to be \textbf{consistent mapping by} $\mathbf{PT}$, if
alpar@3	486	$Cons_{PT}(M)$ is true.
alpar@2	487	\end{definition}
alpar@2	488
alpar@2	489	$Cons_{PT}$ and $Cut_{PT}$ will often be used in the following form.
alpar@2	490	\begin{notation}
alpar@3	491	Let $\mathbf{Cons_{PT}(p, M)}:=Cons_{PT}(M\cup\{p\})$ and
alpar@3	492	$\mathbf{Cut_{PT}(p, M)}:=Cut_{PT}(M\cup\{p\})$, where
alpar@3	493	$p\in{V_{small}\!\times\!V_{large}}$ and $M\cup\{p\}$ is mapping.
alpar@2	494	\end{notation}
alpar@2	495
alpar@3	496	$Cons_{PT}$ will be used to check the consistency of the already
alpar@3	497	covered nodes, while $Cut_{PT}$ is for looking ahead to recognize if
alpar@3	498	no whole consistent mapping can contain the current mapping.
alpar@2	499
alpar@2	500	\subsection{Overview of the algorithm}
alpar@3	501	VF2 uses a state space representation of mappings, $Cons_{PT}$ for
alpar@3	502	excluding inconsistency with the problem type and $Cut_{PT}$ for
alpar@3	503	pruning the search tree. Each state $s$ of the matching process can
alpar@3	504	be associated with a mapping $M(s)$.
alpar@2	505
alpar@4	506	Algorithm~\ref{alg:VF2Pseu} is a high level description of
alpar@3	507	the VF2 matching algorithm.
alpar@2	508
alpar@2	509
alpar@2	510	\begin{algorithm}
alpar@3	511	\algtext{EndIf}%ne nyomtasson end if-et \algtext{EndFor}%ne
alpar@3	512	nyomtasson .. \algtext*{EndProcedure}%ne nyomtasson ..
alpar@2	513	\caption{\hspace{0.5cm}$A\ high\ level\ description\ of\ VF2$}\label{alg:VF2Pseu}
alpar@2	514	\begin{algorithmic}[1]
alpar@2	515
alpar@3	516	\Procedure{VF2}{State $s$, ProblemType $PT$} \If{$M(s$) covers
alpar@3	517	$V_{small}$} \State Output($M(s)$) \Else
alpar@2	518
alpar@3	519	\State Compute the set $P(s)$ of the pairs candidate for inclusion
alpar@3	520	in $M(s)$ \ForAll{$p\in{P(s)}$} \If{Cons$_{PT}$($p, M(s)$) $\wedge$
alpar@3	521	$\neg$Cut$_{PT}$($p, M(s)$)} \State Compute the nascent state
alpar@3	522	$\tilde{s}$ by adding $p$ to $M(s)$ \State \textbf{call}
alpar@3	523	VF2($\tilde{s}$, $PT$) \EndIf \EndFor \EndIf \EndProcedure
alpar@2	524	\end{algorithmic}
alpar@2	525	\end{algorithm}
alpar@2	526
alpar@2	527
alpar@3	528	The initial state $s_0$ is associated with $M(s_0)=\emptyset$, i.e. it
alpar@3	529	starts with an empty mapping.
alpar@2	530
alpar@3	531	For each state $s$, the algorithm computes $P(s)$, the set of
alpar@3	532	candidate node pairs for adding to the current state $s$.
alpar@2	533
alpar@3	534	For each pair $p$ in $P(s)$, $Cons_{PT}(p,M(s))$ and
alpar@3	535	$Cut_{PT}(p,M(s))$ are evaluated. If $Cons_{PT}(p,M(s))$ is true and
alpar@3	536	$Cut_{PT}(p,M(s))$ is false, the successor state $\tilde{s}=s\cup
alpar@3	537	\{p\}$ is computed, and the whole process is recursively applied to
alpar@3	538	$\tilde{s}$. Otherwise, $\tilde{s}$ is not consistent by $PT$ or it
alpar@3	539	can be proved that $s$ can not be extended to a whole mapping.
alpar@2	540
alpar@2	541	In order to make sure of the correctness see
alpar@2	542	\begin{claim}
alpar@3	543	Through consistent mappings, only consistent whole mappings can be
alpar@3	544	reached, and all of the whole mappings are reachable through
alpar@3	545	consistent mappings.
alpar@2	546	\end{claim}
alpar@2	547
alpar@3	548	Note that a state may be reached in many different ways, since the
alpar@3	549	order of insertions into M does not influence the nascent mapping. In
alpar@3	550	fact, the number of different ways which lead to the same state can be
alpar@3	551	exponentially large. If $G_{small}$ and $G_{large}$ are circles with n
alpar@3	552	nodes and n different node labels, there exists exactly one graph
alpar@3	553	isomorphism between them, but it will be reached in $n!$ different
alpar@3	554	ways.
alpar@2	555
alpar@2	556	However, one may observe
alpar@2	557
alpar@2	558	\begin{claim}
alpar@2	559	\label{claim:claimTotOrd}
alpar@3	560	Let $\prec$ an arbitrary total ordering relation on $V_{small}$. If
alpar@3	561	the algorithm ignores each $p=(u,v) \in P(s)$, for which
alpar@2	562	\begin{center}
alpar@2	563	$\exists (\hat{u},\hat{v})\in P(s): \hat{u} \prec u$,
alpar@2	564	\end{center}
alpar@3	565	then no state can be reached more than ones and each state associated
alpar@3	566	with a whole mapping remains reachable.
alpar@2	567	\end{claim}
alpar@2	568
alpar@3	569	Note that the cornerstone of the improvements to VF2 is a proper
alpar@3	570	choice of a total ordering.
alpar@2	571
alpar@2	572	\subsection{The candidate set P(s)}
alpar@2	573	\label{candidateComputingVF2}
alpar@2	574	$P(s)$ is the set of the candidate pairs for inclusion in $M(s)$.
alpar@3	575	Suppose that $PT$ is an expanding problem type, see
alpar@4	576	Definition~\ref{expPT}.
alpar@2	577
alpar@2	578	\begin{notation}
alpar@3	579	Let $\mathbf{T_{small}(s)}:=\{u \in V_{small} : u$ is not covered by
alpar@3	580	$M(s)\wedge\exists \tilde{u}\in{V_{small}: (u,\tilde{u})\in E_{small}}
alpar@3	581	\wedge \tilde{u}$ is covered by $M(s)\}$, and
alpar@3	582	\\ $\mathbf{T_{large}(s)}\!:=\!\{v \in\!V_{large}\!:\!v$ is not
alpar@3	583	covered by
alpar@3	584	$M(s)\wedge\!\exists\tilde{v}\!\in\!{V_{large}\!:\!(v,\tilde{v})\in\!E_{large}}
alpar@3	585	\wedge \tilde{v}$ is covered by $M(s)\}$
alpar@2	586	\end{notation}
alpar@2	587
alpar@3	588	The set $P(s)$ includes the pairs of uncovered neighbours of covered
alpar@3	589	nodes and if there is not such a node pair, all the pairs containing
alpar@3	590	two uncovered nodes are added. Formally, let
alpar@2	591	\[
alpar@2	592	P(s)\!=\!
alpar@2	593	\begin{cases}
alpar@3	594	T_{small}(s)\times T_{large}(s)&\hspace{-0.15cm}\text{if }
alpar@3	595	T_{small}(s)\!\neq\!\emptyset\!\wedge\!T_{large}(s)\!\neq
alpar@3	596	\emptyset,\\ (V_{small}\!\setminus\!M_{small}(s))\!\times\!(V_{large}\!\setminus\!M_{large}(s))
alpar@3	597	&\hspace{-0.15cm}otherwise.
alpar@2	598	\end{cases}
alpar@2	599	\]
alpar@2	600
alpar@2	601	\subsection{Consistency}
alpar@3	602	This section defines the consistency functions for the different
alpar@4	603	problem types mentioned in Section~\ref{sec:CommProb}.
alpar@2	604	\begin{notation}
alpar@3	605	Let $\mathbf{\Gamma_{small} (u)}:=\{\tilde{u}\in V_{small} :
alpar@3	606	(u,\tilde{u})\in E_{small}\}$\\ Let $\mathbf{\Gamma_{large}
alpar@3	607	(v)}:=\{\tilde{v}\in V_{large} : (v,\tilde{v})\in E_{large}\}$
alpar@2	608	\end{notation}
alpar@3	609	Suppose $p=(u,v)$, where $u\in V_{small}$ and $v\in V_{large}$, $s$ is
alpar@3	610	a state of the matching procedure, $M(s)$ is consistent mapping by
alpar@3	611	$PT$ and $lab(u)=lab(v)$. $Cons_{PT}(p,M(s))$ checks whether
alpar@3	612	including pair $p$ into $M(s)$ leads to a consistent mapping by $PT$.
alpar@2	613
alpar@2	614	\subsubsection{Induced subgraph isomorphism}
alpar@3	615	$M(s)\cup \{(u,v)\}$ is a consistent mapping by $IND$ $\Leftrightarrow
alpar@3	616	(\forall \tilde{u}\in M_{small}: (u,\tilde{u})\in E_{small}
alpar@3	617	\Leftrightarrow (v,Pair(M(s),\tilde{u}))\in E_{large})$.\newline The
alpar@3	618	following formulation gives an efficient way of calculating
alpar@3	619	$Cons_{IND}$.
alpar@2	620	\begin{claim}
alpar@3	621	$Cons_{IND}((u,v),M(s)):=(\forall \tilde{v}\in \Gamma_{large}(v)
alpar@3	622	\ \cap\ M_{large}(s):\\(Pair(M(s),\tilde{v}),u)\in E_{small})\wedge
alpar@3	623	(\forall \tilde{u}\in \Gamma_{small}(u)
alpar@3	624	\ \cap\ M_{small}(s):(v,Pair(M(s),\tilde{u}))\in E_{large})$ is a
alpar@3	625	consistency function in the case of $IND$.
alpar@2	626	\end{claim}
alpar@2	627
alpar@2	628	\subsubsection{Graph isomorphism}
alpar@3	629	$M(s)\cup \{(u,v)\}$ is a consistent mapping by $ISO$
alpar@3	630	$\Leftrightarrow$ $M(s)\cup \{(u,v)\}$ is a consistent mapping by
alpar@3	631	$IND$.
alpar@2	632	\begin{claim}
alpar@3	633	$Cons_{ISO}((u,v),M(s))$ is a consistency function by $ISO$ if and
alpar@3	634	only if it is a consistency function by $IND$.
alpar@2	635	\end{claim}
alpar@2	636	\subsubsection{Subgraph isomorphism}
alpar@3	637	$M(s)\cup \{(u,v)\}$ is a consistent mapping by $SUB$ $\Leftrightarrow
alpar@3	638	(\forall \tilde{u}\in M_{small}:\\(u,\tilde{u})\in E_{small}
alpar@3	639	\Rightarrow (v,Pair(M(s),\tilde{u}))\in E_{large})$.
alpar@2	640	\newline
alpar@3	641	The following formulation gives an efficient way of calculating
alpar@3	642	$Cons_{SUB}$.
alpar@2	643	\begin{claim}
alpar@3	644	$Cons_{SUB}((u,v),M(s)):= (\forall \tilde{u}\in \Gamma_{small}(u)
alpar@3	645	\ \cap\ M_{small}(s):\\(v,Pair(M(s),\tilde{u}))\in E_{large})$ is a
alpar@3	646	consistency function by $SUB$.
alpar@2	647	\end{claim}
alpar@2	648
alpar@2	649	\subsection{Cutting rules}
alpar@3	650	$Cut_{PT}(p,M(s))$ is defined by a collection of efficiently
alpar@3	651	verifiable conditions. The requirement is that $Cut_{PT}(p,M(s))$ can
alpar@3	652	be true only if it is impossible to extended $M(s)\cup \{p\}$ to a
alpar@3	653	whole mapping.
alpar@2	654	\begin{notation}
alpar@2	655
alpar@3	656	Let $\mathbf{\tilde{T}_{small}}(s):=(V_{small}\backslash
alpar@3	657	M_{small}(s))\backslash T_{small}(s)$, and
alpar@3	658	\\ $\mathbf{\tilde{T}_{large}}(s):=(V_{large}\backslash
alpar@3	659	M_{large}(s))\backslash T_{large}(s)$.
alpar@2	660	\end{notation}
alpar@2	661	\subsubsection{Induced subgraph isomorphism}
alpar@2	662	\begin{claim}
alpar@3	663	$Cut_{IND}((u,v),M(s)):= \|\Gamma_{large} (v)\ \cap\ T_{large}(s)\| <
alpar@3	664	\|\Gamma_{small} (u)\ \cap\ T_{small}(s)\| \vee \|\Gamma_{large}(v)\cap
alpar@3	665	\tilde{T}_{large}(s)\| < \|\Gamma_{small}(u)\cap
alpar@3	666	\tilde{T}_{small}(s)\|$ is a cutting function by $IND$.
alpar@2	667	\end{claim}
alpar@2	668	\subsubsection{Graph isomorphism}
alpar@3	669	Note that the cutting function of induced subgraph isomorphism defined
alpar@3	670	above is a cutting function by $ISO$, too, however it is less
alpar@3	671	efficient than the following while their computational complexity is
alpar@3	672	the same.
alpar@2	673	\begin{claim}
alpar@3	674	$Cut_{ISO}((u,v),M(s)):= \|\Gamma_{large} (v)\ \cap\ T_{large}(s)\| \neq
alpar@3	675	\|\Gamma_{small} (u)\ \cap\ T_{small}(s)\| \vee \|\Gamma_{large}(v)\cap
alpar@3	676	\tilde{T}_{large}(s)\| \neq \|\Gamma_{small}(u)\cap
alpar@3	677	\tilde{T}_{small}(s)\|$ is a cutting function by $ISO$.
alpar@2	678	\end{claim}
alpar@2	679
alpar@2	680	\subsubsection{Subgraph isomorphism}
alpar@2	681	\begin{claim}
alpar@3	682	$Cut_{SUB}((u,v),M(s)):= \|\Gamma_{large} (v)\ \cap\ T_{large}(s)\| <
alpar@3	683	\|\Gamma_{small} (u)\ \cap\ T_{small}(s)\|$ is a cutting function by
alpar@3	684	$SUB$.
alpar@2	685	\end{claim}
alpar@3	686	Note that there is a significant difference between induced and
alpar@3	687	non-induced subgraph isomorphism:
alpar@2	688
alpar@2	689	\begin{claim}
alpar@2	690	\label{claimSUB}
alpar@3	691	$Cut_{SUB}'((u,v),M(s)):= \|\Gamma_{large} (v)\ \cap\ T_{large}(s)\| <
alpar@3	692	\|\Gamma_{small} (u)\ \cap\ T_{small}(s)\| \vee \|\Gamma_{large}(v)\cap
alpar@3	693	\tilde{T}_{large}(s)\| < \|\Gamma_{small}(u)\cap \tilde{T}_{small}(s)\|$
alpar@3	694	is \textbf{not} a cutting function by $SUB$.
alpar@2	695	\end{claim}
alpar@2	696	\begin{proof}$ $\\
alpar@2	697	\vspace*{-0.5cm}
alpar@2	698
alpar@2	699	\begin{figure}
alpar@2	700	\begin{center}
alpar@2	701	\begin{tikzpicture}
alpar@2	702	[scale=.8,auto=left,every node/.style={circle,fill=black!15}]
alpar@3	703	\node[rectangle,fill=black!15] at (4,6) {$G_{small}$}; \node (u4) at
alpar@3	704	(2.5,10) {$u_4$}; \node (u3) at (5.5,10) {$u_3$}; \node (u1) at
alpar@3	705	(2.5,7) {$u_1$}; \node (u2) at (5.5,7) {$u_2$};
alpar@2	706
alpar@2	707	\node[rectangle,fill=black!30] at (13.5,6) {$G_{large}$};
alpar@3	708	\node[fill=black!30] (v4) at (12,10) {$v_4$}; \node[fill=black!30]
alpar@3	709	(v3) at (15,10) {$v_3$}; \node[fill=black!30] (v1) at (12,7)
alpar@3	710	{$v_1$}; \node[fill=black!30] (v2) at (15,7) {$v_2$};
alpar@2	711
alpar@2	712
alpar@3	713	\foreach \from/\to in {u1/u2,u2/u3,u3/u4,u4/u1} \draw (\from) --
alpar@3	714	(\to); \foreach \from/\to in {v1/v2,v2/v3,v3/v4,v4/v1,v1/v3} \draw
alpar@3	715	(\from) -- (\to);
alpar@2	716	% \draw[dashed] (\from) -- (\to);
alpar@2	717	\end{tikzpicture}
alpar@4	718	\caption{Graphs for the proof of Claim~\ref{claimSUB}}
alpar@4	719	\label{fig:proofSUB}
alpar@2	720	\end{center}
alpar@2	721	\end{figure}
alpar@4	722	Let the two graphs of Figure~\ref{fig:proofSUB} be the input
alpar@3	723	graphs. Suppose the total ordering relation is $u_1 \prec u_2 \prec
alpar@3	724	u_3 \prec u_4$,$M(s)\!=\{(u_1,v_1)\}$, and VF2 tries to add
alpar@3	725	$(u_2,v_2)\in P(s)$.\newline $Cons_{SUB}((u_2,v_2),M(s))=true$, so
alpar@3	726	$M\cup \{(u_2,v_2)\}$ is consistent by $SUB$. The cutting function
alpar@3	727	$Cut_{SUB}((u_2,v_2),M(s))$ is false, so it does not let cut the
alpar@3	728	tree.\newline On the other hand $Cut_{SUB}'((u_2,v_2),M(s))$ is true,
alpar@3	729	since\\$0=\|\Gamma_{large}(v_2)\cap
alpar@3	730	\tilde{T}_{large}(s)\|<\|\Gamma_{small}(u_2)\cap
alpar@3	731	\tilde{T}_{small}(s)\|=1$ is true, but still the tree can not be
alpar@3	732	pruned, because otherwise the
alpar@3	733	$\{(u_1,v_1)(u_2,v_2)(u_3,v_3)(u_4,v_4)\}$ mapping can not be found.
alpar@2	734	\end{proof}
alpar@2	735
alpar@2	736	\section{The VF2++ Algorithm}
alpar@3	737	Although any total ordering relation makes the search space of VF2 a
alpar@3	738	tree, its choice turns out to dramatically influence the number of
alpar@3	739	visited states. The goal is to determine an efficient one as quickly
alpar@3	740	as possible.
alpar@2	741
alpar@3	742	The main reason for VF2++' superiority over VF2 is twofold. Firstly,
alpar@3	743	taking into account the structure and the node labeling of the graph,
alpar@3	744	VF2++ determines a state order in which most of the unfruitful
alpar@3	745	branches of the search space can be pruned immediately. Secondly,
alpar@3	746	introducing more efficient --- nevertheless still easier to compute
alpar@3	747	--- cutting rules reduces the chance of going astray even further.
alpar@2	748
alpar@3	749	In addition to the usual subgraph isomorphism, specialized versions
alpar@3	750	for induced subgraph isomorphism and for graph isomorphism have been
alpar@3	751	designed. VF2++ has gained a runtime improvement of one order of
alpar@3	752	magnitude respecting induced subgraph isomorphism and a better
alpar@3	753	asymptotical behaviour in the case of graph isomorphism problem.
alpar@2	754
alpar@3	755	Note that a weaker version of the cutting rules and the more efficient
alpar@3	756	candidate set calculating were described in \cite{VF2Plus}, too.
alpar@2	757
alpar@3	758	It should be noted that all the methods described in this section are
alpar@3	759	extendable to handle directed graphs and edge labels as well.
alpar@2	760
alpar@3	761	The basic ideas and the detailed description of VF2++ are provided in
alpar@3	762	the following.
alpar@2	763
alpar@2	764	\subsection{Preparations}
alpar@2	765	\begin{claim}
alpar@2	766	\label{claim:claimCoverFromLeft}
alpar@3	767	The total ordering relation uniquely determines a node order, in which
alpar@3	768	the nodes of $V_{small}$ will be covered by VF2. From the point of
alpar@3	769	view of the matching procedure, this means, that always the same node
alpar@3	770	of $G_{small}$ will be covered on the d-th level.
alpar@2	771	\end{claim}
alpar@2	772	\begin{proof}
alpar@3	773	In order to make the search space a tree, the pairs in $\{(u,v)\in
alpar@3	774	P(s) : \exists \hat{u} : \hat{u}\prec u\}$ are excluded from $P(s)$.
alpar@2	775	\newline
alpar@3	776	Let $\tilde{P}(s):=P(s)\backslash \{(u,v)\in P(s) : \exists \hat{u} :
alpar@3	777	\hat{u}\prec u\}$
alpar@2	778	\newline
alpar@3	779	The relation $\prec$ is a total ordering, so $\exists!\ \tilde{u} :
alpar@3	780	\forall\ (u,v)\in \tilde{P}(s): u=\tilde u$. Since a pair form
alpar@3	781	$\tilde{P}(s)$ is chosen for including into $M(s)$, it is obvious,
alpar@3	782	that only $\tilde{u}$ can be covered in $V_{small}$. Actually,
alpar@3	783	$\tilde{u}$ is the smallest element in $T_{small}(s)$ (or in
alpar@3	784	$V_{small}\backslash M_{small}(s)$, if $T_{small}(s)$ were empty), and
alpar@3	785	$T_{small}(s)$ depends only on the covered nodes of $G_{small}$.
alpar@2	786	\newline
alpar@3	787	Simple induction on $d$ shows that the set of covered nodes of
alpar@3	788	$G_{small}$ is unique, if $d$ is given, so $\tilde{u}$ is unique if
alpar@3	789	$d$ is given.
alpar@2	790	\end{proof}
alpar@2	791
alpar@2	792	\begin{definition}
alpar@3	793	An order $(u_{\sigma(1)},u_{\sigma(2)},..,u_{\sigma(\|V_{small}\|)})$ of
alpar@3	794	$V_{small}$ is \textbf{matching order}, if exists $\prec$ total
alpar@3	795	ordering relation, s.t. the VF2 with $\prec$ on the d-th level finds
alpar@3	796	pair for $u_{\sigma(d)}$ for all $d\in\{1,..,\|V_{small}\|\}$.
alpar@2	797	\end{definition}
alpar@2	798
alpar@2	799	\begin{claim}\label{claim:MOclaim}
alpar@3	800	A total ordering is matching order, iff the nodes of every component
alpar@3	801	form an interval in the node sequence, and every node connects to a
alpar@3	802	previous node in its component except the first node of the
alpar@3	803	component. The order of the components is arbitrary. \\Formally
alpar@3	804	spoken, an order
alpar@3	805	$(u_{\sigma(1)},u_{\sigma(2)},..,u_{\sigma(\|V_{small}\|)})$ of
alpar@3	806	$V_{small}$ is matching order $\Leftrightarrow$ $\forall
alpar@3	807	G'_{small}=(V'_{small},E'_{small})\ component\ of\ G_{small}: \forall
alpar@3	808	i: (\exists j : j<i\wedge u_{\sigma(j)},u_{\sigma(i)}\in
alpar@3	809	V'_{small})\Rightarrow \exists k : k < i \wedge (\forall l: k\leq
alpar@3	810	l\leq i \Rightarrow u_{l}\in V'_{small}) \wedge
alpar@3	811	(u_{\sigma{(k)}},u_{\sigma{(i)}})\in E'_{small}$, where $i,j,k,l\in
alpar@3	812	\{1,..,\|V_{small}\|\}$\newline
alpar@2	813	\end{claim}
alpar@2	814	\begin{proof}
alpar@3	815	Suppose a matching order is given. It has to be shown that the node
alpar@3	816	sequence has a structure described above.\\ Let
alpar@3	817	$G'_{small}=(V'_{small},E'_{small})$ be an arbitrary component and $i$
alpar@3	818	an arbitrary index.
alpar@2	819	\newline
alpar@3	820	$(\exists j : j<i\wedge u_{\sigma(j)},u_{\sigma(i)}\in
alpar@3	821	V'_{small})\Rightarrow u_{\sigma(i)}$ is not the first covered node in
alpar@3	822	$G_{small}\ \Rightarrow u_{\sigma(i)}$ is connected to a covered node
alpar@3	823	$u_{\sigma(k)}$ where $k<i$, since $u_{\sigma(i)}\in T_{small}(s)$ for
alpar@3	824	some $s$ and $T_{small}(s)\subseteq{V'_{small}}$ contains only nodes
alpar@3	825	connected to at least one covered node. It's easy to see, that
alpar@3	826	$\forall l: k\leq l\leq i \Rightarrow u_{l}\in V'_{small}$, since
alpar@3	827	$T_{small}(s)$ contains only nodes connected to some covered ones,
alpar@3	828	while it is not empty, but if it were empty, then $u_{\sigma(k)}$ and
alpar@3	829	$u_{\sigma(i)}$ were not in the same component.
alpar@2	830
alpar@3	831	Now, let us show that if a node sequence has the special structure
alpar@3	832	described above, it has to be matching
alpar@3	833	order.\\ $(u_{\sigma(1)},u_{\sigma(2)},..,u_{\sigma(\|V_{small}\|)})$ is
alpar@3	834	a total ordering, which determines the matching order
alpar@3	835	$(u_{\sigma(1)},u_{\sigma(2)},..,u_{\sigma(\|V_{small}\|)})$.
alpar@2	836	\end{proof}
alpar@2	837
alpar@3	838	To summing up, a total ordering always uniquely determines a matching
alpar@3	839	order, and every matching order can be determined by a total ordering,
alpar@3	840	however, more than one different total orderings may determine the
alpar@3	841	same matching order.
alpar@2	842	\subsection{Idea behind the algorithm}
alpar@3	843	The goal is to find a matching order in which the algorithm is able to
alpar@3	844	recognize inconsistency or prune the infeasible branches on the
alpar@3	845	highest levels and goes deep only if it is needed.
alpar@2	846
alpar@2	847	\begin{notation}
alpar@3	848	Let $\mathbf{Conn_{H}(u)}:=\|\Gamma_{small}(u)\cap H\}\|$, that is the
alpar@3	849	number of neighbours of u which are in H, where $u\in V_{small} $ and
alpar@3	850	$H\subseteq V_{small}$.
alpar@2	851	\end{notation}
alpar@2	852
alpar@3	853	The principal question is the following. Suppose a state $s$ is
alpar@3	854	given. For which node of $T_{small}(s)$ is the hardest to find a
alpar@3	855	consistent pair in $G_{large}$? The more covered neighbours a node in
alpar@3	856	$T_{small}(s)$ has --- i.e. the largest $Conn_{M_{small}(s)}$ it has
alpar@3	857	---, the more rarely satisfiable consistency constraints for its pair
alpar@3	858	are given.
alpar@2	859
alpar@3	860	In biology, most of the graphs are sparse, thus several nodes in
alpar@3	861	$T_{small}(s)$ may have the same $Conn_{M_{small}(s)}$, which makes
alpar@3	862	reasonable to define a secondary and a tertiary order between them.
alpar@3	863	The observation above proves itself to be as determining, that the
alpar@3	864	secondary ordering prefers nodes with the most uncovered neighbours
alpar@3	865	among which have the same $Conn_{M_{small}(s)}$ to increase
alpar@3	866	$Conn_{M_{small}(s)}$ of uncovered nodes so much, as possible. The
alpar@3	867	tertiary ordering prefers nodes having the rarest uncovered labels.
alpar@2	868
alpar@3	869	Note that the secondary ordering is the same as the ordering by $deg$,
alpar@3	870	which is a static data in front of the above used.
alpar@2	871
alpar@3	872	These rules can easily result in a matching order which contains the
alpar@3	873	nodes of a long path successively, whose nodes may have low $Conn$ and
alpar@3	874	is easily matchable into $G_{large}$. To avoid that, a BFS order is
alpar@3	875	used, which provides the shortest possible paths.
alpar@2	876	\newline
alpar@2	877
alpar@3	878	In the following, some examples on which the VF2 may be slow are
alpar@3	879	described, although they are easily solvable by using a proper
alpar@3	880	matching order.
alpar@2	881
alpar@2	882	\begin{example}
alpar@3	883	Suppose $G_{small}$ can be mapped into $G_{large}$ in many ways
alpar@3	884	without node labels. Let $u\in V_{small}$ and $v\in V_{large}$.
alpar@2	885	\newline
alpar@2	886	$lab(u):=black$
alpar@2	887	\newline
alpar@2	888	$lab(v):=black$
alpar@2	889	\newline
alpar@3	890	$lab(\tilde{u}):=red \ \forall \tilde{u}\in (V_{small}\backslash
alpar@3	891	\{u\})$
alpar@2	892	\newline
alpar@3	893	$lab(\tilde{v}):=red \ \forall \tilde{v}\in (V_{large}\backslash
alpar@3	894	\{v\})$
alpar@2	895	\newline
alpar@2	896
alpar@3	897	Now, any mapping by the node label $lab$ must contain $(u,v)$, since
alpar@3	898	$u$ is black and no node in $V_{large}$ has a black label except
alpar@3	899	$v$. If unfortunately $u$ were the last node which will get covered,
alpar@3	900	VF2 would check only in the last steps, whether $u$ can be matched to
alpar@3	901	$v$.
alpar@2	902	\newline
alpar@3	903	However, had $u$ been the first matched node, u would have been
alpar@3	904	matched immediately to v, so all the mappings would have been
alpar@3	905	precluded in which node labels can not correspond.
alpar@2	906	\end{example}
alpar@2	907
alpar@2	908	\begin{example}
alpar@3	909	Suppose there is no node label given, $G_{small}$ is a small graph and
alpar@3	910	can not be mapped into $G_{large}$ and $u\in V_{small}$.
alpar@2	911	\newline
alpar@3	912	Let $G'_{small}:=(V_{small}\cup
alpar@3	913	\{u'_{1},u'_{2},..,u'_{k}\},E_{small}\cup
alpar@3	914	\{(u,u'_{1}),(u'_{1},u'_{2}),..,(u'_{k-1},u'_{k})\})$, that is,
alpar@3	915	$G'_{small}$ is $G_{small}\cup \{ a\ k$ long path, which is disjoint
alpar@3	916	from $G_{small}$ and one of its starting points is connected to $u\in
alpar@3	917	V_{small}\}$.
alpar@2	918	\newline
alpar@3	919	Is there a subgraph of $G_{large}$, which is isomorph with
alpar@3	920	$G'_{small}$?
alpar@2	921	\newline
alpar@3	922	If unfortunately the nodes of the path were the first $k$ nodes in the
alpar@3	923	matching order, the algorithm would iterate through all the possible k
alpar@3	924	long paths in $G_{large}$, and it would recognize that no path can be
alpar@3	925	extended to $G'_{small}$.
alpar@2	926	\newline
alpar@3	927	However, had it started by the matching of $G_{small}$, it would not
alpar@3	928	have matched any nodes of the path.
alpar@2	929	\end{example}
alpar@2	930
alpar@3	931	These examples may look artificial, but the same problems also appear
Madarasi@7	932	in real-world instances, even though in a less obvious way.
alpar@2	933
alpar@2	934	\subsection{Total ordering}
alpar@3	935	Instead of the total ordering relation, the matching order will be
alpar@3	936	searched directly.
alpar@2	937	\begin{notation}
alpar@3	938	Let \textbf{F$_\mathcal{M}$(l)}$:=\|\{v\in V_{large} :
alpar@3	939	l=lab(v)\}\|-\|\{u\in V_{small}\backslash \mathcal{M} : l=lab(u)\}\|$ ,
alpar@3	940	where $l$ is a label and $\mathcal{M}\subseteq V_{small}$.
alpar@2	941	\end{notation}
alpar@2	942
alpar@2	943	\begin{definition}Let $\mathbf{arg\ max}_{f}(S) :=\{u : u\in S \wedge f(u)=max_{v\in S}\{f(v)\}\}$ and $\mathbf{arg\ min}_{f}(S) := arg\ max_{-f}(S)$, where $S$ is a finite set and $f:S\longrightarrow \mathbb{R}$.
alpar@2	944	\end{definition}
alpar@2	945
alpar@2	946	\begin{algorithm}
alpar@3	947	\algtext{EndIf}%ne nyomtasson end if-et \algtext{EndFor}%ne
alpar@3	948	nyomtasson .. \algtext*{EndProcedure}%ne nyomtasson ..
alpar@2	949	\algtext*{EndWhile}
alpar@2	950	\caption{\hspace{0.5cm}$The\ method\ of\ VF2++\ for\ determining\ the\ node\ order$}\label{alg:VF2PPPseu}
alpar@2	951	\begin{algorithmic}[1]
alpar@3	952	\Procedure{VF2++order}{} \State $\mathcal{M}$ := $\emptyset$
alpar@3	953	\Comment{matching order} \While{$V_{small}\backslash \mathcal{M}
alpar@3	954	\neq\emptyset$} \State $r\in$ arg max$_{deg}$ (arg
alpar@3	955	min$_{F_\mathcal{M}\circ lab}(V_{small}\backslash
alpar@3	956	\mathcal{M})$)\label{alg:findMin} \State Compute $T$, a BFS tree with
alpar@3	957	root node $r$. \For{$d=0,1,...,depth(T)$} \State $V_d$:=nodes of the
alpar@3	958	$d$-th level \State Process $V_d$ \Comment{See Algorithm
alpar@4	959	\ref{alg:VF2PPProcess1}) and \ref{alg:VF2PPProcess2}} \EndFor
alpar@3	960	\EndWhile \EndProcedure
alpar@2	961	\end{algorithmic}
alpar@2	962	\end{algorithm}
alpar@2	963
alpar@2	964	\begin{algorithm}
alpar@3	965	\algtext{EndIf}%ne nyomtasson end if-et \algtext{EndFor}%ne
alpar@3	966	nyomtasson .. \algtext*{EndProcedure}%ne nyomtasson ..
alpar@2	967	\algtext*{EndWhile}
alpar@2	968	\caption{\hspace{.5cm}$A\ method\ for\ processing\ a\ level\ of\ the\ BFS\ tree$}\label{alg:VF2PPProcess1}
alpar@2	969	\begin{algorithmic}[1]
alpar@3	970	\Procedure{VF2++ProcessLevel1}{$V_{d}$} \While{$V_d\neq\emptyset$}
alpar@3	971	\State $m\in$ arg min$_{F_\mathcal{M}\circ\ lab}($ arg max$_{deg}($arg
alpar@3	972	max$_{Conn_{\mathcal{M}}}(V_{d})))$ \State $V_d:=V_d\backslash m$
alpar@3	973	\State Append node $m$ to the end of $\mathcal{M}$ \State Refresh
alpar@3	974	$F_\mathcal{M}$ \EndWhile \EndProcedure
alpar@2	975	\end{algorithmic}
alpar@2	976	\end{algorithm}
alpar@2	977
alpar@2	978	\begin{algorithm}
alpar@3	979	\algtext{EndIf}%ne nyomtasson end if-et \algtext{EndFor}%ne
alpar@3	980	nyomtasson .. \algtext*{EndProcedure}%ne nyomtasson ..
alpar@2	981	\algtext*{EndWhile}
alpar@2	982	\caption{\hspace{0.5cm}$Another\ method\ for\ processing\ a\ level\ of\ the\ BFS\ tree$}\label{alg:VF2PPProcess2}
alpar@2	983	\begin{algorithmic}[1]
alpar@3	984	\Procedure{VF2++ProcessLevel2}{$V_{d}$} \State Sort $V_d$ in
alpar@3	985	descending lex. order by $(Conn_{\mathcal{M}},deg,-F_\mathcal{M})$
alpar@3	986	\State Append the sorted $V_d$ to the end of $\mathcal{M}$ \State
alpar@3	987	Refresh $F_\mathcal{M}$ \EndProcedure
alpar@2	988	\end{algorithmic}
alpar@2	989	\end{algorithm}
alpar@4	990	Algorithm~\ref{alg:VF2PPPseu} is a high level description of the
alpar@4	991	matching order procedure of VF2++. It computes a BFS tree for each
alpar@3	992	component in ascending order of their rarest $lab$ and largest $deg$,
alpar@4	993	whose root vertex is the component's minimal
alpar@4	994	node. Algorithms~\ref{alg:VF2PPProcess1} and \ref{alg:VF2PPProcess2}
alpar@4	995	are two different methods to process a level of the BFS tree.
alpar@2	996
alpar@3	997	After sorting the nodes of the current level in descending
alpar@3	998	lexicographic order by $(Conn_{\mathcal{M}},deg,-F_\mathcal{M})$,
alpar@4	999	Algorithm~\ref{alg:VF2PPProcess2} appends them simultaneously to the
alpar@4	1000	matching order $\mathcal{M}$ and refreshes $F_\mathcal{M}$ just once,
alpar@4	1001	whereas Algorithm~\ref{alg:VF2PPProcess1} appends the nodes separately
alpar@4	1002	to $\mathcal{M}$ and refreshes $F_\mathcal{M}$ immediately, so it
alpar@3	1003	provides up-to-date label information and may result in a more
alpar@3	1004	efficient matching order.
alpar@2	1005
alpar@4	1006	Claim~\ref{claim:MOclaim} shows that Algorithm~\ref{alg:VF2PPPseu}
alpar@4	1007	provides a matching order.
alpar@2	1008
alpar@2	1009
alpar@2	1010	\subsection{Cutting rules}
alpar@2	1011	\label{VF2PPCuttingRules}
alpar@3	1012	This section presents the cutting rules of VF2++, which are improved
alpar@3	1013	by using extra information coming from the node labels.
alpar@2	1014	\begin{notation}
alpar@3	1015	Let $\mathbf{\Gamma_{small}^{l}(u)}:=\{\tilde{u} : lab(\tilde{u})=l
alpar@3	1016	\wedge \tilde{u}\in \Gamma_{small} (u)\}$ and
alpar@3	1017	$\mathbf{\Gamma_{large}^{l}(v)}:=\{\tilde{v} : lab(\tilde{v})=l \wedge
alpar@3	1018	\tilde{v}\in \Gamma_{large} (v)\}$, where $u\in V_{small}$, $v\in
alpar@3	1019	V_{large}$ and $l$ is a label.
alpar@2	1020	\end{notation}
alpar@2	1021
alpar@2	1022	\subsubsection{Induced subgraph isomorphism}
alpar@2	1023	\begin{claim}
alpar@2	1024	\[LabCut_{IND}((u,v),M(s))\!:=\!\!\!\!\!\bigvee_{l\ is\ label}\!\!\!\!\!\!\!\|\Gamma_{large}^{l} (v) \cap T_{large}(s)\|\!<\!\|\Gamma_{small}^{l}(u)\cap T_{small}(s)\|\ \vee\]\[\bigvee_{l\ is\ label} \newline \|\Gamma_{large}^{l}(v)\cap \tilde{T}_{large}(s)\| < \|\Gamma_{small}^{l}(u)\cap \tilde{T}_{small}(s)\|\] is a cutting function by IND.
alpar@2	1025	\end{claim}
alpar@2	1026	\begin{proof}
alpar@3	1027	It has to be shown, that $LabCut_{IND}((u,v),M(s))=true\Rightarrow$
alpar@3	1028	the mapping can not be extended to a whole
alpar@3	1029	mapping.\\ $LabCut_{IND}((u,v),M(s))=true,$ iff the following
alpar@3	1030	holds. $\\\exists l: \|\Gamma_{large}^{l} (v)\ \cap\ T_{large}(s)\| <
alpar@3	1031	\|\Gamma_{small}^{l} (u)\ \cap\ T_{small}(s)\| \vee
alpar@3	1032	\|\Gamma_{large}^{l}(v)\cap \tilde{T}_{large}(s)\| <
alpar@3	1033	\|\Gamma_{small}^{l}(u)\cap \tilde{T}_{small}(s)\|$.
alpar@2	1034
alpar@3	1035	Suppose that $\|\Gamma_{large}^{l} (v)\ \cap\ T_{large}(s)\| <
alpar@3	1036	\|\Gamma_{small}^{l} (u)\ \cap\ T_{small}(s)\|$. Each node of
alpar@3	1037	$\Gamma_{small}^{l} (u)\ \cap\ T_{small}(s)$ has to be matched to a
alpar@3	1038	node in $\Gamma_{large}^{l} (v)\ \cap\ T_{large}(s)$, so
alpar@3	1039	$\Gamma_{large}^{l} (v)\ \cap\ T_{large}(s)$ can not be smaller than
alpar@3	1040	$\Gamma_{small}^{l} (u)\ \cap\ T_{small}(s)$. That is why $M(s)$ can
alpar@3	1041	not be extended to a whole mapping.
alpar@2	1042
alpar@3	1043	Otherwise $\|\Gamma_{large}^{l}(v)\cap \tilde{T}_{large}(s)\| <
alpar@3	1044	\|\Gamma_{small}^{l}(u)\cap \tilde{T}_{small}(s)\|$ has to be
alpar@3	1045	true. Similarly, each node of $\Gamma_{large}^{l}(v)\cap
alpar@3	1046	\tilde{T}_{large}(s)$ has to be matched to a node in
alpar@3	1047	$\Gamma_{small}^{l}(u)\cap \tilde{T}_{small}(s)$,
alpar@3	1048	i.e. $\Gamma_{large}^{l}(v)\cap \tilde{T}_{large}(s)$ can not be
alpar@3	1049	smaller than $\Gamma_{small}^{l}(u)\cap \tilde{T}_{small}(s)$, if
alpar@3	1050	$M(s)$ is extendible.
alpar@2	1051	\end{proof}
alpar@2	1052	The following claims can be proven similarly.
alpar@2	1053	\subsubsection{Graph isomorphism}
alpar@2	1054	\begin{claim}
alpar@2	1055	\[LabCut_{ISO}((u,v),M(s))\!:=\!\!\!\!\!\bigvee_{l\ is\ label}\!\!\!\!\!\!\!\|\Gamma_{large}^{l} (v) \cap T_{large}(s)\|\!\neq\!\|\Gamma_{small}^{l}(u)\cap T_{small}(s)\|\ \vee\]\[\bigvee_{l\ is\ label} \newline \|\Gamma_{large}^{l}(v)\cap \tilde{T}_{large}(s)\| \neq \|\Gamma_{small}^{l}(u)\cap \tilde{T}_{small}(s)\|\] is a cutting function by ISO.
alpar@2	1056	\end{claim}
alpar@2	1057
alpar@2	1058	\subsubsection{Subgraph isomorphism}
alpar@2	1059	\begin{claim}
Madarasi@7	1060	\[LabCut_{SUB}((u,v),M(s))\!:=\!\!\!\!\!\bigvee_{l\ is\ label}\!\!\!\!\!\!\!\|\Gamma_{large}^{l} (v) \cap T_{large}(s)\|\!<\!\|\Gamma_{small}^{l}(u)\cap T_{small}(s)\|\] is a cutting function by SUB.
alpar@2	1061	\end{claim}
alpar@2	1062
alpar@2	1063
alpar@2	1064
alpar@2	1065	\subsection{Implementation details}
alpar@3	1066	This section provides a detailed summary of an efficient
alpar@3	1067	implementation of VF2++.
alpar@2	1068	\subsubsection{Storing a mapping}
alpar@3	1069	After fixing an arbitrary node order ($u_0, u_1, ..,
alpar@3	1070	u_{\|G_{small}\|-1}$) of $G_{small}$, an array $M$ is usable to store
alpar@3	1071	the current mapping in the following way.
alpar@2	1072	\[
alpar@3	1073	M[i] =
alpar@2	1074	\begin{cases}
alpar@3	1075	v & if\ (u_i,v)\ is\ in\ the\ mapping\\ INVALID &
alpar@3	1076	if\ no\ node\ has\ been\ mapped\ to\ u_i.
alpar@2	1077	\end{cases}
alpar@2	1078	\]
alpar@3	1079	Where $i\in\{0,1, ..,\|G_{small}\|-1\}$, $v\in V_{large}$ and $INVALID$
alpar@3	1080	means "no node".
alpar@2	1081	\subsubsection{Avoiding the recurrence}
alpar@3	1082	Exploring the state space was described in a recursive fashion using
alpar@4	1083	sets (see Algorithm~\ref{alg:VF2Pseu}), which makes the
alpar@3	1084	algorithm easy to understand but does not show directly an efficient
alpar@3	1085	way of implementation. The following approach avoiding recursion and
alpar@3	1086	using lookup tables instead of sets is not only fast but has linear
alpar@3	1087	space complexity as well.
alpar@2	1088
alpar@4	1089	The recursion of Algorithm~\ref{alg:VF2Pseu} can be realized
alpar@3	1090	as a while loop, which has a loop counter $depth$ denoting the
alpar@3	1091	all-time depth of the recursion. Fixing a matching order, let $M$
alpar@3	1092	denote the array storing the all-time mapping. The initial state is
alpar@3	1093	associated with the empty mapping, which means that $\forall i:
alpar@3	1094	M[i]=INVALID$ and $depth=0$. In case of a recursive call, $depth$ has
alpar@3	1095	to be incremented, while in case of a return, it has to be
alpar@4	1096	decremented. Based on Claim~\ref{claim:claimCoverFromLeft},
alpar@3	1097	$M$ is $INVALID$ from index $depth$+1 and not $INVALID$ before
alpar@3	1098	$depth$, i.e. $\forall i: i < depth \Rightarrow M[i]\neq INVALID$ and
alpar@3	1099	$\forall i: i > depth \Rightarrow M[i]= INVALID$. $M[depth]$ changes
alpar@3	1100	while the state is being processed, but the property is held before
alpar@3	1101	both stepping back to a predecessor state and exploring a successor
alpar@3	1102	state.
alpar@2	1103
alpar@3	1104	The necessary part of the candidate set is easily maintainable or
alpar@3	1105	computable by following
alpar@4	1106	Section~\ref{candidateComputingVF2}. A much faster method
alpar@3	1107	has been designed for biological- and sparse graphs, see the next
alpar@3	1108	section for details.
alpar@2	1109
alpar@2	1110	\subsubsection{Calculating the candidates for a node}
alpar@4	1111	Being aware of Claim~\ref{claim:claimCoverFromLeft}, the
alpar@3	1112	task is not to maintain the candidate set, but to generate the
alpar@3	1113	candidate nodes in $G_{large}$ for a given node $u\in V_{small}$. In
alpar@3	1114	case of an expanding problem type and $M$ mapping, if a node $v\in
alpar@3	1115	V_{large}$ is a potential pair of $u\in V_{small}$, then $\forall
alpar@3	1116	u'\in V_{small} : (u,u')\in
alpar@3	1117	E_{small}\ and\ u'\ is\ covered\ by\ M\ \Rightarrow (v,Pair(M,u'))\in
alpar@3	1118	E_{large}$. That is, each covered neighbour of $u$ has to be mapped to
alpar@3	1119	a covered neighbour of $v$.
alpar@2	1120
alpar@3	1121	Having said that, an algorithm running in $\Theta(deg)$ time is
alpar@3	1122	describable if there exists a covered node in the component containing
alpar@3	1123	$u$. In this case choose a covered neighbour $u'$ of $u$ arbitrarily
alpar@3	1124	--- such a node exists based on
alpar@4	1125	Claim~\ref{claim:MOclaim}. With all the candidates of $u$
alpar@3	1126	being among the uncovered neighbours of $Pair(M,u')$, there are solely
alpar@3	1127	$deg(Pair(M,u'))$ nodes to check.
alpar@2	1128
alpar@3	1129	An easy trick is to choose an $u'$, for which
alpar@3	1130	$\|\{uncovered\ neighbours\ $ $of\ Pair(M,u')\}\|$ is the smallest
alpar@3	1131	possible.
alpar@2	1132
alpar@3	1133	Note that if $u$ is the first node of its component, then all the
alpar@3	1134	uncovered nodes of $G_{large}$ are candidates, so giving a sublinear
alpar@3	1135	method is impossible.
alpar@2	1136
alpar@2	1137
alpar@2	1138	\subsubsection{Determining the node order}
alpar@3	1139	This section describes how the node order preprocessing method of
alpar@3	1140	VF2++ can efficiently be implemented.
alpar@2	1141
alpar@3	1142	For using lookup tables, the node labels are associated with the
alpar@3	1143	numbers $\{0,1,..,\|K\|-1\}$, where $K$ is the set of the labels. It
alpar@3	1144	enables $F_\mathcal{M}$ to be stored in an array, for which
alpar@3	1145	$F_\mathcal{M}[i]=F_\mathcal{M}(i)$ where $i=0,1,..,\|K\|-1$. At first,
alpar@3	1146	$\mathcal{M}=\emptyset$, so $F_\mathcal{M}[i]$ is the number of nodes
alpar@3	1147	in $V_{small}$ having label i, which is easy to compute in
alpar@3	1148	$\Theta(\|V_{small}\|)$ steps.
alpar@2	1149
alpar@3	1150	$\mathcal{M}\subseteq V_{small}$ can be represented as an array of
alpar@3	1151	size $\|V_{small}\|$.
alpar@2	1152
alpar@3	1153	The BFS tree is computed by using a FIFO data structure which is
alpar@3	1154	usually implemented as a linked list, but one can avoid it by using
alpar@3	1155	the array $\mathcal{M}$ itself. $\mathcal{M}$ contains all the nodes
alpar@3	1156	seen before, a pointer shows where the first node of the FIFO is, and
alpar@3	1157	another one shows where the next explored node has to be inserted. So
alpar@3	1158	the nodes of each level of the BFS tree can be processed by
alpar@4	1159	Algorithms~\ref{alg:VF2PPProcess1} and
alpar@4	1160	\ref{alg:VF2PPProcess2} in place by swapping nodes.
alpar@2	1161
alpar@3	1162	After a node $u$ gets to the next place of the node order,
alpar@3	1163	$F_\mathcal{M}[lab[u]]$ has to be decreased by one, because there is
alpar@3	1164	one less covered node in $V_{large}$ with label $lab(u)$, that is why
alpar@3	1165	min selection sort is preferred which gives the elements from left to
alpar@4	1166	right in descending order, see Algorithm~\ref{alg:VF2PPProcess1}.
alpar@2	1167
alpar@3	1168	Note that using a $\Theta(n^2)$ sort absolutely does not slow down the
alpar@3	1169	procedure on biological (and on sparse) graphs, since they have few
alpar@3	1170	nodes on a level. If a level had a large number of nodes,
alpar@4	1171	Algorithm~\ref{alg:VF2PPProcess2} would seem to be a better
alpar@3	1172	choice with a $\Theta(nlog(n))$ or Bucket sort, but it may reduce the
alpar@3	1173	efficiency of the matching procedure, since $F_\mathcal{M}(i)$ can not
alpar@3	1174	be immediately refreshed, so it is unable to provide up-to-date label
alpar@3	1175	information.
alpar@2	1176
alpar@4	1177	Note that the \textit{while loop} of Algorithm~\ref{alg:VF2PPPseu}
alpar@4	1178	takes one iteration per graph component and the graphs in biology are
alpar@4	1179	mostly connected.
alpar@4	1180
alpar@2	1181	\subsubsection{Cutting rules}
alpar@4	1182	In Section~\ref{VF2PPCuttingRules}, the cutting rules were
alpar@3	1183	described using the sets $T_{small}$, $T_{large}$, $\tilde T_{small}$
alpar@3	1184	and $\tilde T_{large}$, which are dependent on the all-time mapping
alpar@3	1185	(i.e. on the all-time state). The aim is to check the labeled cutting
alpar@3	1186	rules of VF2++ in $\Theta(deg)$ time.
alpar@2	1187
alpar@3	1188	Firstly, suppose that these four sets are given in such a way, that
alpar@3	1189	checking whether a node is in a certain set takes constant time,
alpar@3	1190	e.g. they are given by their 0-1 characteristic vectors. Let $L$ be an
alpar@3	1191	initially zero integer lookup table of size $\|K\|$. After incrementing
alpar@3	1192	$L[lab(u')]$ for all $u'\in \Gamma_{small}(u) \cap T_{small}(s)$ and
alpar@3	1193	decrementing $L[lab(v')]$ for all $v'\in\Gamma_{large} (v) \cap
alpar@3	1194	T_{large}(s)$, the first part of the cutting rules is checkable in
alpar@3	1195	$\Theta(deg)$ time by considering the proper signs of $L$. Setting $L$
alpar@3	1196	to zero takes $\Theta(deg)$ time again, which makes it possible to use
alpar@3	1197	the same table through the whole algorithm. The second part of the
alpar@3	1198	cutting rules can be verified using the same method with $\tilde
alpar@3	1199	T_{small}$ and $\tilde T_{large}$ instead of $T_{small}$ and
alpar@3	1200	$T_{large}$. Thus, the overall complexity is $\Theta(deg)$.
alpar@2	1201
alpar@3	1202	An other integer lookup table storing the number of covered neighbours
alpar@3	1203	of each node in $G_{large}$ gives all the information about the sets
alpar@3	1204	$T_{large}$ and $\tilde T_{large}$, which is maintainable in
alpar@3	1205	$\Theta(deg)$ time when a pair is added or substracted by incrementing
alpar@3	1206	or decrementing the proper indices. A further improvement is that the
alpar@3	1207	values of $L[lab(u')]$ in case of checking $u$ is dependent only on
alpar@3	1208	$u$, i.e. on the size of the mapping, so for each $u\in V_{small}$ an
alpar@3	1209	array of pairs (label, number of such labels) can be stored to skip
alpar@3	1210	the maintaining operations. Note that these arrays are at most of size
alpar@3	1211	$deg$. Skipping this trick, the number of covered neighbours has to be
alpar@3	1212	stored for each node of $G_{small}$ as well to get the sets
alpar@3	1213	$T_{small}$ and $\tilde T_{small}$.
alpar@2	1214
alpar@3	1215	Using similar tricks, the consistency function can be evaluated in
alpar@3	1216	$\Theta(deg)$ steps, as well.
alpar@2	1217
alpar@2	1218	\section{The VF2 Plus Algorithm}
alpar@3	1219	The VF2 Plus algorithm is a recently improved version of VF2. It was
alpar@3	1220	compared with the state of the art algorithms in \cite{VF2Plus} and
alpar@3	1221	has proven itself to be competitive with RI, the best algorithm on
alpar@3	1222	biological graphs. \\ A short summary of VF2 Plus follows, which uses
alpar@3	1223	the notation and the conventions of the original paper.
alpar@2	1224
alpar@2	1225	\subsection{Ordering procedure}
alpar@3	1226	VF2 Plus uses a sorting procedure that prefers nodes in $V_{small}$
alpar@3	1227	with the lowest probability to find a pair in $V_{small}$ and the
alpar@3	1228	highest number of connections with the nodes already sorted by the
alpar@3	1229	algorithm.
alpar@2	1230
alpar@2	1231	\begin{definition}
alpar@3	1232	$(u,v)$ is a \textbf{feasible pair}, if $lab(u)=lab(v)$ and
alpar@3	1233	$deg(u)\leq deg(v)$, where $u\in{V_{small}}$ and $ v\in{V_{large}}$.
alpar@2	1234	\end{definition}
alpar@3	1235	$P_{lab}(L):=$ a priori probability to find a node with label $L$ in
alpar@3	1236	$V_{large}$
alpar@2	1237	\newline
alpar@3	1238	$P_{deg}(d):=$ a priori probability to find a node with degree $d$ in
alpar@3	1239	$V_{large}$
alpar@2	1240	\newline
alpar@3	1241	$P(u):=P_{lab}(L)*\bigcup_{d'>d}P_{deg}(d')$\\ $M$ is the set of
alpar@3	1242	already sorted nodes, $T$ is the set of nodes candidate to be
alpar@3	1243	selected, and $degreeM$ of a node is the number of its neighbours in
alpar@3	1244	$M$.
alpar@2	1245	\begin{algorithm}
alpar@3	1246	\algtext{EndIf}%ne nyomtasson end if-et \algtext{EndFor}%ne
alpar@3	1247	nyomtasson .. \algtext*{EndProcedure}%ne nyomtasson ..
alpar@2	1248	\algtext*{EndWhile}
alpar@2	1249	\caption{}\label{alg:VF2PlusPseu}
alpar@2	1250	\begin{algorithmic}[1]
alpar@3	1251	\Procedure{VF2 Plus order}{} \State Select the node with the lowest
alpar@3	1252	$P$. \If {more nodes share the same $P$} \State select the one with
alpar@3	1253	maximum degree \EndIf \If {more nodes share the same $P$ and have the
alpar@3	1254	max degree} \State select the first \EndIf \State Put the selected
alpar@3	1255	node in the set $M$. \label{alg:putIn} \State Put all its unsorted
alpar@3	1256	neighbours in the set $T$. \If {$M\neq V_{small}$} \State From set
alpar@3	1257	$T$ select the node with maximum $degreeM$. \If {more nodes have
alpar@3	1258	maximum $degreeM$} \State Select the one with the lowest $P$ \EndIf
alpar@3	1259	\If {more nodes have maximum $degreeM$ and $P$} \State Select the
alpar@3	1260	first. \EndIf \State \textbf{goto \ref{alg:putIn}.} \EndIf
alpar@2	1261	\EndProcedure
alpar@2	1262	\end{algorithmic}
alpar@2	1263	\end{algorithm}
alpar@2	1264
alpar@4	1265	Using these notations, Algorithm~\ref{alg:VF2PlusPseu}
alpar@3	1266	provides the description of the sorting procedure.
alpar@2	1267
alpar@3	1268	Note that $P(u)$ is not the exact probability of finding a consistent
alpar@3	1269	pair for $u$ by choosing a node of $V_{large}$ randomly, since
alpar@3	1270	$P_{lab}$ and $P_{deg}$ are not independent, though calculating the
alpar@3	1271	real probability would take quadratic time, which may be reduced by
alpar@3	1272	using fittingly lookup tables.
alpar@2	1273
alpar@2	1274	\section{Experimental results}
alpar@3	1275	This section compares the performance of VF2++ and VF2 Plus. Both
alpar@3	1276	algorithms have run faster with orders of magnitude than VF2, thus its
alpar@3	1277	inclusion was not reasonable.
alpar@2	1278	\subsection{Biological graphs}
alpar@3	1279	The tests have been executed on a recent biological dataset created
alpar@3	1280	for the International Contest on Pattern Search in Biological
Madarasi@7	1281	Databases\cite{Content}, which has been constructed of molecule,
Madarasi@7	1282	protein and contact map graphs extracted from the Protein Data
alpar@3	1283	Bank\cite{ProteinDataBank}.
alpar@2	1284
alpar@3	1285	The molecule dataset contains small graphs with less than 100 nodes
alpar@3	1286	and an average degree of less than 3. The protein dataset contains
alpar@3	1287	graphs having 500-10 000 nodes and an average degree of 4, while the
alpar@3	1288	contact map dataset contains graphs with 150-800 nodes and an average
alpar@3	1289	degree of 20. \\
alpar@2	1290
alpar@3	1291	In the following, the induced subgraph isomorphism and the graph
alpar@3	1292	isomorphism will be examined.
alpar@2	1293
Madarasi@7	1294	This dataset provides graph pairs, between which all the induced subgraph isomorphisms have to be found. For run time results, please see Figure~\ref{fig:bioIND}.
Madarasi@7	1295
Madarasi@7	1296	In an other experiment, the nodes of each graph in the database had been
Madarasi@7	1297	shuffled, and an isomorphism between the shuffled and the original
Madarasi@7	1298	graph was searched. The solution times are shown on Figure~\ref{fig:bioISO}.
Madarasi@7	1299
Madarasi@7	1300
alpar@2	1301
alpar@2	1302	\begin{figure}[H]
Madarasi@7	1303	\vspace*{-2cm}
Madarasi@7	1304	\hspace*{-1.5cm}
Madarasi@7	1305	\begin{subfigure}[b]{0.55\textwidth}
Madarasi@7	1306	\begin{figure}[H]
Madarasi@7	1307	\begin{tikzpicture}[trim axis left, trim axis right]
Madarasi@7	1308	\begin{axis}[title=Molecules ISO,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid
Madarasi@7	1309	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
Madarasi@7	1310	west},scaled x ticks = false,x tick label style={/pgf/number
Madarasi@7	1311	format/1000 sep = \thinspace}]
Madarasi@7	1312	%\addplot+[only marks] table {proteinsOrig.txt};
Madarasi@7	1313	\addplot table {Orig/moleculesIso.txt}; \addplot[mark=triangle*,mark
Madarasi@7	1314	size=1.8pt,color=red] table {VF2PPLabel/moleculesIso.txt};
Madarasi@7	1315	\end{axis}
Madarasi@7	1316	\end{tikzpicture}
Madarasi@7	1317	\caption{In the case of molecules, there is not such a significant
Madarasi@7	1318	difference, but VF2++ seems to be faster as the number of nodes
Madarasi@7	1319	increases.}\label{fig:ISOMolecule}
Madarasi@7	1320	\end{figure}
Madarasi@7	1321	\end{subfigure}
Madarasi@7	1322	\hspace*{1.5cm}
Madarasi@7	1323	\begin{subfigure}[b]{0.55\textwidth}
Madarasi@7	1324	\begin{figure}[H]
Madarasi@7	1325	\begin{tikzpicture}[trim axis left, trim axis right]
Madarasi@7	1326	\begin{axis}[title=Contact maps ISO,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid
Madarasi@7	1327	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
Madarasi@7	1328	west},scaled x ticks = false,x tick label style={/pgf/number
Madarasi@7	1329	format/1000 sep = \thinspace}]
Madarasi@7	1330	%\addplot+[only marks] table {proteinsOrig.txt};
Madarasi@7	1331	\addplot table {Orig/contactMapsIso.txt}; \addplot[mark=triangle*,mark
Madarasi@7	1332	size=1.8pt,color=red] table {VF2PPLabel/contactMapsIso.txt};
Madarasi@7	1333	\end{axis}
Madarasi@7	1334	\end{tikzpicture}
Madarasi@7	1335	\caption{The results are closer to each other on contact maps, but
Madarasi@7	1336	VF2++ still performs consistently better.}\label{fig:ISOContact}
Madarasi@7	1337	\end{figure}
Madarasi@7	1338	\end{subfigure}
Madarasi@7	1339
alpar@2	1340	\begin{center}
Madarasi@7	1341	\vspace*{-0.5cm}
Madarasi@7	1342	\begin{subfigure}[b]{0.55\textwidth}
Madarasi@7	1343	\begin{figure}[H]
Madarasi@7	1344	\begin{tikzpicture}[trim axis left, trim axis right]
Madarasi@7	1345	\begin{axis}[title=Proteins ISO,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid
Madarasi@7	1346	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
Madarasi@7	1347	west},scaled x ticks = false,x tick label style={/pgf/number
Madarasi@7	1348	format/1000 sep = \thinspace}]
Madarasi@7	1349	%\addplot+[only marks] table {proteinsOrig.txt};
Madarasi@7	1350	\addplot table {Orig/proteinsIso.txt}; \addplot[mark=triangle*,mark
Madarasi@7	1351	size=1.8pt,color=red] table {VF2PPLabel/proteinsIso.txt};
Madarasi@7	1352	\end{axis}
Madarasi@7	1353	\end{tikzpicture}
Madarasi@7	1354	\caption{On protein graphs, VF2 Plus has a super linear time
Madarasi@7	1355	complexity, while VF2++ runs in near constant time. The difference
Madarasi@7	1356	is about two order of magnitude on large graphs.}\label{fig:ISOProt}
Madarasi@7	1357	\end{figure}
Madarasi@7	1358	\end{subfigure}
Madarasi@7	1359	\end{center}
Madarasi@7	1360	\vspace*{-0.6cm}
Madarasi@7	1361	\caption{\normalsize{Graph isomomorphism on biological graphs}}\label{fig:bioISO}
Madarasi@7	1362	\end{figure}
Madarasi@7	1363
Madarasi@7	1364
Madarasi@7	1365	\begin{figure}[H]
Madarasi@7	1366	\vspace*{-2cm}
Madarasi@7	1367	\hspace*{-1.5cm}
Madarasi@7	1368	\begin{subfigure}[b]{0.55\textwidth}
Madarasi@7	1369	\begin{figure}[H]
Madarasi@7	1370	\begin{tikzpicture}[trim axis left, trim axis right]
Madarasi@7	1371	\begin{axis}[title=Molecules IND,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid
Madarasi@7	1372	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
Madarasi@7	1373	west},scaled x ticks = false,x tick label style={/pgf/number
Madarasi@7	1374	format/1000 sep = \thinspace}]
Madarasi@7	1375	%\addplot+[only marks] table {proteinsOrig.txt};
Madarasi@7	1376	\addplot table {Orig/Molecules.32.txt}; \addplot[mark=triangle*,mark
Madarasi@7	1377	size=1.8pt,color=red] table {VF2PPLabel/Molecules.32.txt};
Madarasi@7	1378	\end{axis}
Madarasi@7	1379	\end{tikzpicture}
Madarasi@7	1380	\caption{In the case of molecules, the algorithms have
Madarasi@7	1381	similar behaviour, but VF2++ is almost two times faster even on such
Madarasi@7	1382	small graphs.} \label{fig:INDMolecule}
Madarasi@7	1383	\end{figure}
Madarasi@7	1384	\end{subfigure}
Madarasi@7	1385	\hspace*{1.5cm}
Madarasi@7	1386	\begin{subfigure}[b]{0.55\textwidth}
Madarasi@7	1387	\begin{figure}[H]
Madarasi@7	1388	\begin{tikzpicture}[trim axis left, trim axis right]
Madarasi@7	1389	\begin{axis}[title=Contact maps IND,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid
Madarasi@7	1390	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
Madarasi@7	1391	west},scaled x ticks = false,x tick label style={/pgf/number
Madarasi@7	1392	format/1000 sep = \thinspace}]
Madarasi@7	1393	%\addplot+[only marks] table {proteinsOrig.txt};
Madarasi@7	1394	\addplot table {Orig/ContactMaps.128.txt};
Madarasi@7	1395	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
Madarasi@7	1396	{VF2PPLabel/ContactMaps.128.txt};
Madarasi@7	1397	\end{axis}
Madarasi@7	1398	\end{tikzpicture}
Madarasi@7	1399	\caption{On contact maps, VF2++ runs in near constant time, while VF2
Madarasi@7	1400	Plus has a near linear behaviour.} \label{fig:INDContact}
Madarasi@7	1401	\end{figure}
Madarasi@7	1402	\end{subfigure}
Madarasi@7	1403
Madarasi@7	1404	\begin{center}
Madarasi@7	1405	\vspace*{-0.5cm}
Madarasi@7	1406	\begin{subfigure}[b]{0.55\textwidth}
Madarasi@7	1407	\begin{figure}[H]
Madarasi@7	1408	\begin{tikzpicture}[trim axis left, trim axis right]
alpar@2	1409	\begin{axis}[title=Proteins IND,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid
alpar@3	1410	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1411	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1412	format/1000 sep = \thinspace}] %\addplot+[only marks] table
alpar@3	1413	{proteinsOrig.txt}; \addplot[mark=*,mark size=1.2pt,color=blue]
alpar@3	1414	table {Orig/Proteins.256.txt}; \addplot[mark=triangle*,mark
alpar@3	1415	size=1.8pt,color=red] table {VF2PPLabel/Proteins.256.txt};
alpar@2	1416	\end{axis}
alpar@2	1417	\end{tikzpicture}
alpar@3	1418	\caption{Both the algorithms have linear behaviour on protein
alpar@3	1419	graphs. VF2++ is more than 10 times faster than VF2
alpar@3	1420	Plus.} \label{fig:INDProt}
alpar@2	1421	\end{figure}
Madarasi@7	1422	\end{subfigure}
alpar@2	1423	\end{center}
Madarasi@7	1424	\vspace*{-0.5cm}
Madarasi@7	1425	\caption{\normalsize{Graph isomomorphism on biological graphs}}\label{fig:bioIND}
alpar@2	1426	\end{figure}
alpar@2	1427
alpar@2	1428
alpar@2	1429
alpar@2	1430
alpar@2	1431
alpar@2	1432	\subsection{Random graphs}
alpar@3	1433	This section compares VF2++ with VF2 Plus on random graphs of a large
alpar@3	1434	size. The node labels are uniformly distributed. Let $\delta$ denote
alpar@3	1435	the average degree. For the parameters of problems solved in the
alpar@3	1436	experiments, please see the top of each chart.
alpar@2	1437	\subsubsection{Graph isomorphism}
alpar@3	1438	To evaluate the efficiency of the algorithms in the case of graph
alpar@3	1439	isomorphism, connected graphs of less than 20 000 nodes have been
alpar@3	1440	considered. Generating a random graph and shuffling its nodes, an
Madarasi@7	1441	isomorphism had to be found. Figure \ref{fig:randISO} shows the runtime results
alpar@4	1442	on graph sets of various density.
alpar@2	1443
Madarasi@7	1444
Madarasi@7	1445
Madarasi@7	1446
alpar@2	1447	\begin{figure}[H]
Madarasi@7	1448	\vspace*{-1.5cm}
Madarasi@7	1449	\hspace*{-1.5cm}
Madarasi@7	1450	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1451	\begin{center}
alpar@2	1452	\begin{tikzpicture}
Madarasi@7	1453	\begin{axis}[title={Random ISO, $\delta = 5$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1454	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1455	west},scaled x ticks = false,x tick label style={/pgf/number
Madarasi@7	1456	format/1000 sep = \space}]
alpar@2	1457	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1458	\addplot table {randGraph/iso/vf2pIso5_1.txt};
alpar@3	1459	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1460	{randGraph/iso/vf2ppIso5_1.txt};
alpar@2	1461	\end{axis}
alpar@2	1462	\end{tikzpicture}
alpar@2	1463	\end{center}
Madarasi@7	1464	\end{subfigure}
Madarasi@7	1465	%\hspace{1cm}
Madarasi@7	1466	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1467	\begin{center}
alpar@2	1468	\begin{tikzpicture}
Madarasi@7	1469	\begin{axis}[title={Random ISO, $\delta = 10$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1470	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1471	west},scaled x ticks = false,x tick label style={/pgf/number
Madarasi@7	1472	format/1000 sep = \space}]
alpar@2	1473	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1474	\addplot table {randGraph/iso/vf2pIso10_1.txt};
alpar@3	1475	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1476	{randGraph/iso/vf2ppIso10_1.txt};
alpar@2	1477	\end{axis}
alpar@2	1478	\end{tikzpicture}
alpar@2	1479	\end{center}
Madarasi@7	1480	\end{subfigure}
Madarasi@7	1481	%%\hspace{1cm}
Madarasi@7	1482	\hspace*{-1.5cm}
Madarasi@7	1483	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1484	\begin{center}
alpar@2	1485	\begin{tikzpicture}
Madarasi@7	1486	\begin{axis}[title={Random ISO, $\delta = 15$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1487	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1488	west},scaled x ticks = false,x tick label style={/pgf/number
Madarasi@7	1489	format/1000 sep = \space}]
alpar@2	1490	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1491	\addplot table {randGraph/iso/vf2pIso15_1.txt};
alpar@3	1492	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1493	{randGraph/iso/vf2ppIso15_1.txt};
alpar@2	1494	\end{axis}
alpar@2	1495	\end{tikzpicture}
alpar@2	1496	\end{center}
Madarasi@7	1497	\end{subfigure}
Madarasi@7	1498	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1499	\begin{center}
alpar@2	1500	\begin{tikzpicture}
Madarasi@7	1501	\begin{axis}[title={Random ISO, $\delta = 35$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1502	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1503	west},scaled x ticks = false,x tick label style={/pgf/number
Madarasi@7	1504	format/1000 sep = \space}]
alpar@2	1505	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1506	\addplot table {randGraph/iso/vf2pIso35_1.txt};
alpar@3	1507	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1508	{randGraph/iso/vf2ppIso35_1.txt};
alpar@2	1509	\end{axis}
alpar@2	1510	\end{tikzpicture}
alpar@2	1511	\end{center}
Madarasi@7	1512	\end{subfigure}
Madarasi@7	1513	\begin{subfigure}[b]{0.55\textwidth}
Madarasi@7	1514	\hspace*{-1.5cm}
alpar@2	1515	\begin{tikzpicture}
Madarasi@7	1516	\begin{axis}[title={Random ISO, $\delta = 45$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1517	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1518	west},scaled x ticks = false,x tick label style={/pgf/number
Madarasi@7	1519	format/1000 sep = \space}]
alpar@2	1520	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1521	\addplot table {randGraph/iso/vf2pIso45_1.txt};
alpar@3	1522	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1523	{randGraph/iso/vf2ppIso45_1.txt};
alpar@2	1524	\end{axis}
alpar@2	1525	\end{tikzpicture}
Madarasi@7	1526	\end{subfigure}
Madarasi@7	1527	\hspace*{-1.5cm}
Madarasi@7	1528	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1529	\begin{tikzpicture}
Madarasi@7	1530	\begin{axis}[title={Random ISO, $\delta = 100$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1531	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1532	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1533	format/1000 sep = \thinspace}]
alpar@2	1534	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1535	\addplot table {randGraph/iso/vf2pIso100_1.txt};
alpar@3	1536	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1537	{randGraph/iso/vf2ppIso100_1.txt};
alpar@2	1538	\end{axis}
alpar@2	1539	\end{tikzpicture}
Madarasi@7	1540	\end{subfigure}
alpar@2	1541	\vspace*{-0.8cm}
Madarasi@7	1542	\caption{IND on graphs having an average degree of
Madarasi@7	1543	5.}\label{fig:randISO}
alpar@2	1544	\end{figure}
alpar@2	1545
alpar@2	1546
Madarasi@7	1547
Madarasi@7	1548
Madarasi@7	1549
Madarasi@7	1550
Madarasi@7	1551
Madarasi@7	1552
Madarasi@7	1553
Madarasi@7	1554
alpar@3	1555	Considering the graph isomorphism problem, VF2++ consistently
alpar@3	1556	outperforms its rival especially on sparse graphs. The reason for the
alpar@3	1557	slightly super linear behaviour of VF2++ on denser graphs is the
alpar@3	1558	larger number of nodes in the BFS tree constructed in
alpar@4	1559	Algorithm~\ref{alg:VF2PPPseu}.
alpar@2	1560
alpar@2	1561	\subsubsection{Induced subgraph isomorphism}
alpar@3	1562	This section provides a comparison of VF2++ and VF2 Plus in the case
alpar@3	1563	of induced subgraph isomorphism. In addition to the size of the large
alpar@3	1564	graph, that of the small graph dramatically influences the hardness of
alpar@3	1565	a given problem too, so the overall picture is provided by examining
alpar@3	1566	small graphs of various size.
alpar@2	1567
alpar@3	1568	For each chart, a number $0<\rho< 1$ has been fixed and the following
alpar@3	1569	has been executed 150 times. Generating a large graph $G_{large}$,
alpar@3	1570	choose 10 of its induced subgraphs having $\rho\ \|V_{large}\|$ nodes,
alpar@3	1571	and for all the 10 subgraphs find a mapping by using both the graph
alpar@3	1572	matching algorithms. The $\delta = 5, 10, 35$ and $\rho = 0.05, 0.1,
alpar@4	1573	0.3, 0.6, 0.8, 0.95$ cases have been examined (see
alpar@4	1574	Figure~\ref{fig:randIND5}, \ref{fig:randIND10} and
alpar@4	1575	\ref{fig:randIND35}, and for each $\delta$, a cumulative chart is
alpar@4	1576	given as well, which excludes $\rho = 0.05$ and $0.1$ for the sake of
alpar@4	1577	perspicuity (see Figure~\ref{fig:randIND5Sum}, \ref{fig:randIND10Sum}
alpar@4	1578	and \ref{fig:randIND35Sum}).
alpar@2	1579
alpar@2	1580
alpar@2	1581
alpar@2	1582
alpar@2	1583
alpar@2	1584	\begin{figure}[H]
Madarasi@7	1585	\vspace*{-1.5cm}
Madarasi@7	1586	\hspace*{-1.5cm}
alpar@2	1587	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1588	\begin{center}
alpar@2	1589	\begin{tikzpicture}
alpar@2	1590	\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.05$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1591	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1592	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1593	format/1000 sep = \space}]
alpar@2	1594	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1595	\addplot table {randGraph/ind/vf2pInd5_0.05.txt};
alpar@3	1596	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1597	{randGraph/ind/vf2ppInd5_0.05.txt};
alpar@2	1598	\end{axis}
alpar@2	1599	\end{tikzpicture}
alpar@2	1600	\end{center}
alpar@2	1601	\end{subfigure}
alpar@2	1602	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1603	\begin{center}
alpar@2	1604	\begin{tikzpicture}
alpar@2	1605	\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.1$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1606	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1607	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1608	format/1000 sep = \space}]
alpar@2	1609	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1610	\addplot table {randGraph/ind/vf2pInd5_0.1.txt};
alpar@3	1611	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1612	{randGraph/ind/vf2ppInd5_0.1.txt};
alpar@2	1613	\end{axis}
alpar@2	1614	\end{tikzpicture}
alpar@2	1615	\end{center}
alpar@2	1616	\end{subfigure}
Madarasi@7	1617	\hspace*{-1.5cm}
alpar@2	1618	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1619	\begin{center}
alpar@2	1620	\begin{tikzpicture}
alpar@2	1621	\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.3$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1622	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1623	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1624	format/1000 sep = \space}]
alpar@2	1625	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1626	\addplot table {randGraph/ind/vf2pInd5_0.3.txt};
alpar@3	1627	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1628	{randGraph/ind/vf2ppInd5_0.3.txt};
alpar@2	1629	\end{axis}
alpar@2	1630	\end{tikzpicture}
alpar@2	1631	\end{center}
alpar@2	1632	\end{subfigure}
alpar@2	1633	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1634	\begin{center}
alpar@2	1635	\begin{tikzpicture}
alpar@2	1636	\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.6$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1637	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1638	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1639	format/1000 sep = \space}]
alpar@2	1640	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1641	\addplot table {randGraph/ind/vf2pInd5_0.6.txt};
alpar@3	1642	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1643	{randGraph/ind/vf2ppInd5_0.6.txt};
alpar@2	1644	\end{axis}
alpar@2	1645	\end{tikzpicture}
alpar@2	1646	\end{center}
alpar@2	1647	\end{subfigure}
alpar@2	1648	\begin{subfigure}[b]{0.55\textwidth}
Madarasi@7	1649	\hspace*{-1.5cm}
alpar@2	1650	\begin{tikzpicture}
alpar@2	1651	\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.8$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1652	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1653	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1654	format/1000 sep = \space}]
alpar@2	1655	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1656	\addplot table {randGraph/ind/vf2pInd5_0.8.txt};
alpar@3	1657	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1658	{randGraph/ind/vf2ppInd5_0.8.txt};
alpar@2	1659	\end{axis}
alpar@2	1660	\end{tikzpicture}
alpar@2	1661	\end{subfigure}
Madarasi@7	1662	\hspace*{-1.5cm}
alpar@2	1663	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1664	\begin{tikzpicture}
alpar@2	1665	\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.95$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1666	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1667	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1668	format/1000 sep = \thinspace}]
alpar@2	1669	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1670	\addplot table {randGraph/ind/vf2pInd5_0.95.txt};
alpar@3	1671	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1672	{randGraph/ind/vf2ppInd5_0.95.txt};
alpar@2	1673	\end{axis}
alpar@2	1674	\end{tikzpicture}
alpar@2	1675	\end{subfigure}
alpar@2	1676	\vspace*{-0.8cm}
alpar@3	1677	\caption{IND on graphs having an average degree of
alpar@3	1678	5.}\label{fig:randIND5}
alpar@2	1679	\end{figure}
alpar@2	1680
alpar@2	1681	\begin{figure}[H]
alpar@2	1682	\begin{center}
Madarasi@7	1683	\hspace*{-2cm}
alpar@2	1684	\begin{tikzpicture}
alpar@2	1685	\begin{axis}[title={Rand IND Summary, $\delta = 5$, $\rho = 0.3, 0.6, 0.8, 0.95$},height=17cm,width=16cm,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},line width=0.8pt,grid
alpar@3	1686	=major,mark size=1pt, legend style={at={(0,1)},anchor=north
alpar@3	1687	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1688	format/1000 sep = \thinspace}]
alpar@2	1689	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@3	1690	\addplot[mark=*,mark size=1.5pt,color=blue] table
alpar@3	1691	{randGraph/ind/vf2pInd5_0.3.txt}; \addplot[mark=triangle*,mark
alpar@3	1692	size=1.8pt,color=red] table
alpar@3	1693	{randGraph/ind/vf2ppInd5_0.3.txt}; \addplot[mark=*,mark
alpar@3	1694	size=1.5pt,color=blue] table
alpar@3	1695	{randGraph/ind/vf2pInd5_0.6.txt}; \addplot[mark=triangle*,mark
alpar@3	1696	size=1.8pt,color=red] table
alpar@3	1697	{randGraph/ind/vf2ppInd5_0.6.txt}; \addplot[mark=*,mark
alpar@3	1698	size=1.5pt,color=blue] table
alpar@3	1699	{randGraph/ind/vf2pInd5_0.8.txt}; \addplot[mark=triangle*,mark
alpar@3	1700	size=1.8pt,color=red] table
alpar@3	1701	{randGraph/ind/vf2ppInd5_0.8.txt}; \addplot[mark=*,mark
alpar@3	1702	size=1.5pt,color=blue] table
alpar@3	1703	{randGraph/ind/vf2pInd5_0.95.txt};
alpar@3	1704	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1705	{randGraph/ind/vf2ppInd5_0.95.txt};
alpar@2	1706	\end{axis}
alpar@2	1707	\end{tikzpicture}
alpar@2	1708	\end{center}
alpar@2	1709	\vspace*{-0.8cm}
alpar@2	1710	\caption{Cummulative chart for $\delta=5$.}\label{fig:randIND5Sum}
alpar@2	1711	\end{figure}
alpar@2	1712
alpar@2	1713
alpar@2	1714
alpar@2	1715	\begin{figure}[H]
Madarasi@7	1716	\vspace*{-1.5cm}
Madarasi@7	1717	\hspace*{-1.5cm}
alpar@2	1718	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1719	\begin{center}
Madarasi@7	1720	\hspace*{-0.5cm}
alpar@2	1721	\begin{tikzpicture}
alpar@2	1722	\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.05$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1723	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1724	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1725	format/1000 sep = \space}]
alpar@2	1726	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1727	\addplot table {randGraph/ind/vf2pInd10_0.05.txt};
alpar@3	1728	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1729	{randGraph/ind/vf2ppInd10_0.05.txt};
alpar@2	1730	\end{axis}
alpar@2	1731	\end{tikzpicture}
alpar@2	1732	\end{center}
alpar@2	1733	\end{subfigure}
alpar@2	1734	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1735	\begin{center}
Madarasi@7	1736	\hspace*{-0.5cm}
alpar@2	1737	\begin{tikzpicture}
alpar@2	1738	\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.1$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1739	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1740	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1741	format/1000 sep = \space}]
alpar@2	1742	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1743	\addplot table {randGraph/ind/vf2pInd10_0.1.txt};
alpar@3	1744	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1745	{randGraph/ind/vf2ppInd10_0.1.txt};
alpar@2	1746	\end{axis}
alpar@2	1747	\end{tikzpicture}
alpar@2	1748	\end{center}
alpar@2	1749	\end{subfigure}
Madarasi@7	1750	\hspace*{-1.5cm}
alpar@2	1751	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1752	\begin{center}
alpar@2	1753	\begin{tikzpicture}
alpar@2	1754	\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.3$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1755	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1756	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1757	format/1000 sep = \space}]
alpar@2	1758	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1759	\addplot table {randGraph/ind/vf2pInd10_0.3.txt};
alpar@3	1760	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1761	{randGraph/ind/vf2ppInd10_0.3.txt};
alpar@2	1762	\end{axis}
alpar@2	1763	\end{tikzpicture}
alpar@2	1764	\end{center}
alpar@2	1765	\end{subfigure}
alpar@2	1766	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1767	\begin{center}
alpar@2	1768	\begin{tikzpicture}
alpar@2	1769	\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.6$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1770	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1771	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1772	format/1000 sep = \space}]
alpar@2	1773	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1774	\addplot table {randGraph/ind/vf2pInd10_0.6.txt};
alpar@3	1775	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1776	{randGraph/ind/vf2ppInd10_0.6.txt};
alpar@2	1777	\end{axis}
alpar@2	1778	\end{tikzpicture}
alpar@2	1779	\end{center}
alpar@2	1780	\end{subfigure}
Madarasi@7	1781	\hspace*{-1.5cm}
alpar@2	1782	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1783	\begin{tikzpicture}
alpar@2	1784	\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.8$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1785	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1786	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1787	format/1000 sep = \space}]
alpar@2	1788	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1789	\addplot table {randGraph/ind/vf2pInd10_0.8.txt};
alpar@3	1790	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1791	{randGraph/ind/vf2ppInd10_0.8.txt};
alpar@2	1792	\end{axis}
alpar@2	1793	\end{tikzpicture}
alpar@2	1794	\end{subfigure}
alpar@2	1795	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1796	\begin{tikzpicture}
alpar@2	1797	\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.95$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1798	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1799	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1800	format/1000 sep = \thinspace}]
alpar@2	1801	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1802	\addplot table {randGraph/ind/vf2pInd10_0.95.txt};
alpar@3	1803	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1804	{randGraph/ind/vf2ppInd10_0.95.txt};
alpar@2	1805	\end{axis}
alpar@2	1806	\end{tikzpicture}
alpar@2	1807	\end{subfigure}
alpar@2	1808	\vspace*{-0.8cm}
alpar@3	1809	\caption{IND on graphs having an average degree of
alpar@3	1810	10.}\label{fig:randIND10}
alpar@2	1811	\end{figure}
alpar@2	1812
alpar@2	1813	\begin{figure}[H]
alpar@2	1814	\begin{center}
Madarasi@7	1815	\hspace*{-2cm}
alpar@2	1816	\begin{tikzpicture}
alpar@2	1817	\begin{axis}[title={Rand IND Summary, $\delta = 10$, $\rho = 0.3, 0.6, 0.8, 0.95$},height=17cm,width=16cm,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},line width=0.8pt,grid
alpar@3	1818	=major,mark size=1pt, legend style={at={(0,1)},anchor=north
alpar@3	1819	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1820	format/1000 sep = \thinspace}]
alpar@2	1821	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@3	1822	\addplot[mark=*,mark size=1.5pt,color=blue] table
alpar@3	1823	{randGraph/ind/vf2pInd10_0.3.txt};
alpar@3	1824	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1825	{randGraph/ind/vf2ppInd10_0.3.txt};
alpar@3	1826	\addplot[mark=*,mark size=1.5pt,color=blue] table
alpar@3	1827	{randGraph/ind/vf2pInd10_0.6.txt};
alpar@3	1828	\addplot[mark=triangle*,mark
alpar@3	1829	size=1.8pt,color=red] table
alpar@3	1830	{randGraph/ind/vf2ppInd10_0.6.txt};
alpar@3	1831	\addplot[mark=*,mark
alpar@3	1832	size=1.5pt,color=blue] table
alpar@3	1833	{randGraph/ind/vf2pInd10_0.8.txt};
alpar@3	1834	\addplot[mark=triangle*,mark
alpar@3	1835	size=1.8pt,color=red] table
alpar@3	1836	{randGraph/ind/vf2ppInd10_0.8.txt};
alpar@3	1837	\addplot[mark=*,mark
alpar@3	1838	size=1.5pt,color=blue]
alpar@3	1839	table
alpar@3	1840	{randGraph/ind/vf2pInd10_0.95.txt};
alpar@3	1841	\addplot[mark=triangle*,mark
alpar@3	1842	size=1.8pt,color=red]
alpar@3	1843	table
alpar@3	1844	{randGraph/ind/vf2ppInd10_0.95.txt};
alpar@2	1845	\end{axis}
alpar@2	1846	\end{tikzpicture}
alpar@2	1847	\end{center}
alpar@2	1848	\vspace*{-0.8cm}
alpar@2	1849	\caption{Cummulative chart for $\delta=10$.}\label{fig:randIND10Sum}
alpar@2	1850	\end{figure}
alpar@2	1851
alpar@2	1852
alpar@2	1853
alpar@2	1854	\begin{figure}[H]
Madarasi@7	1855	\vspace*{-1.5cm}
Madarasi@7	1856	\hspace*{-1.5cm}
alpar@2	1857	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1858	\begin{center}
alpar@2	1859	\begin{tikzpicture}
alpar@2	1860	\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.05$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1861	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1862	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1863	format/1000 sep = \space}]
alpar@2	1864	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1865	\addplot table {randGraph/ind/vf2pInd35_0.05.txt};
alpar@3	1866	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1867	{randGraph/ind/vf2ppInd35_0.05.txt};
alpar@2	1868	\end{axis}
alpar@2	1869	\end{tikzpicture}
alpar@2	1870	\end{center}
alpar@2	1871	\end{subfigure}
alpar@2	1872	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1873	\begin{center}
alpar@2	1874	\begin{tikzpicture}
alpar@2	1875	\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.1$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1876	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1877	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1878	format/1000 sep = \space}]
alpar@2	1879	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1880	\addplot table {randGraph/ind/vf2pInd35_0.1.txt};
alpar@3	1881	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1882	{randGraph/ind/vf2ppInd35_0.1.txt};
alpar@2	1883	\end{axis}
alpar@2	1884	\end{tikzpicture}
alpar@2	1885	\end{center}
alpar@2	1886	\end{subfigure}
Madarasi@7	1887	\hspace*{-1.5cm}
alpar@2	1888	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1889	\begin{center}
alpar@2	1890	\begin{tikzpicture}
alpar@2	1891	\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.3$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1892	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1893	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1894	format/1000 sep = \space}]
alpar@2	1895	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1896	\addplot table {randGraph/ind/vf2pInd35_0.3.txt};
alpar@3	1897	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1898	{randGraph/ind/vf2ppInd35_0.3.txt};
alpar@2	1899	\end{axis}
alpar@2	1900	\end{tikzpicture}
alpar@2	1901	\end{center}
alpar@2	1902	\end{subfigure}
alpar@2	1903	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1904	\begin{center}
alpar@2	1905	\begin{tikzpicture}
alpar@2	1906	\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.6$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1907	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1908	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1909	format/1000 sep = \space}]
alpar@2	1910	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1911	\addplot table {randGraph/ind/vf2pInd35_0.6.txt};
alpar@3	1912	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1913	{randGraph/ind/vf2ppInd35_0.6.txt};
alpar@2	1914	\end{axis}
alpar@2	1915	\end{tikzpicture}
alpar@2	1916	\end{center}
alpar@2	1917	\end{subfigure}
Madarasi@7	1918	\hspace*{-1.5cm}
alpar@2	1919	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1920	\begin{tikzpicture}
alpar@2	1921	\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.8$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1922	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1923	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1924	format/1000 sep = \space}]
alpar@2	1925	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1926	\addplot table {randGraph/ind/vf2pInd35_0.8.txt};
alpar@3	1927	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1928	{randGraph/ind/vf2ppInd35_0.8.txt};
alpar@2	1929	\end{axis}
alpar@2	1930	\end{tikzpicture}
alpar@2	1931	\end{subfigure}
alpar@2	1932	\begin{subfigure}[b]{0.55\textwidth}
alpar@2	1933	\begin{tikzpicture}
alpar@2	1934	\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.95$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid
alpar@3	1935	=major,mark size=1.2pt, legend style={at={(0,1)},anchor=north
alpar@3	1936	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1937	format/1000 sep = \thinspace}]
alpar@2	1938	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@2	1939	\addplot table {randGraph/ind/vf2pInd35_0.95.txt};
alpar@3	1940	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1941	{randGraph/ind/vf2ppInd35_0.95.txt};
alpar@2	1942	\end{axis}
alpar@2	1943	\end{tikzpicture}
alpar@2	1944	\end{subfigure}
alpar@2	1945	\vspace*{-0.8cm}
alpar@3	1946	\caption{IND on graphs having an average degree of
alpar@3	1947	35.}\label{fig:randIND35}
alpar@2	1948	\end{figure}
alpar@2	1949
alpar@2	1950	\begin{figure}[H]
alpar@2	1951	\begin{center}
Madarasi@7	1952	\hspace*{-2cm}
alpar@2	1953	\begin{tikzpicture}
alpar@2	1954	\begin{axis}[title={Rand IND Summary, $\delta = 35$, $\rho = 0.3, 0.6, 0.8, 0.95$},height=17cm,width=16cm,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},line width=0.8pt,grid
alpar@3	1955	=major,mark size=1pt, legend style={at={(0,1)},anchor=north
alpar@3	1956	west},scaled x ticks = false,x tick label style={/pgf/number
alpar@3	1957	format/1000 sep = \thinspace}]
alpar@2	1958	%\addplot+[only marks] table {proteinsOrig.txt};
alpar@3	1959	\addplot[mark=*,mark size=1.5pt,color=blue] table
alpar@3	1960	{randGraph/ind/vf2pInd35_0.3.txt};
alpar@3	1961	\addplot[mark=triangle*,mark size=1.8pt,color=red] table
alpar@3	1962	{randGraph/ind/vf2ppInd35_0.3.txt};
alpar@3	1963	\addplot[mark=*,mark size=1.5pt,color=blue] table
alpar@3	1964	{randGraph/ind/vf2pInd35_0.6.txt};
alpar@3	1965	\addplot[mark=triangle*,mark
alpar@3	1966	size=1.8pt,color=red] table
alpar@3	1967	{randGraph/ind/vf2ppInd35_0.6.txt};
alpar@3	1968	\addplot[mark=*,mark
alpar@3	1969	size=1.5pt,color=blue] table
alpar@3	1970	{randGraph/ind/vf2pInd35_0.8.txt};
alpar@3	1971	\addplot[mark=triangle*,mark
alpar@3	1972	size=1.8pt,color=red] table
alpar@3	1973	{randGraph/ind/vf2ppInd35_0.8.txt};
alpar@3	1974	\addplot[mark=*,mark
alpar@3	1975	size=1.5pt,color=blue]
alpar@3	1976	table
alpar@3	1977	{randGraph/ind/vf2pInd35_0.95.txt};
alpar@3	1978	\addplot[mark=triangle*,mark
alpar@3	1979	size=1.8pt,color=red]
alpar@3	1980	table
alpar@3	1981	{randGraph/ind/vf2ppInd35_0.95.txt};
alpar@2	1982	\end{axis}
alpar@2	1983	\end{tikzpicture}
alpar@2	1984	\end{center}
alpar@2	1985	\vspace*{-0.8cm}
alpar@2	1986	\caption{Cummulative chart for $\delta=35$.}\label{fig:randIND35Sum}
alpar@2	1987	\end{figure}
alpar@2	1988
alpar@3	1989	Based on these experiments, VF2++ is faster than VF2 Plus and able to
alpar@3	1990	handle really large graphs in milliseconds. Note that when $IND$ was
alpar@3	1991	considered and the small graphs had proportionally few nodes ($\rho =
alpar@3	1992	0.05$, or $\rho = 0.1$), then VF2 Plus produced some inefficient node
alpar@4	1993	orders (e.g. see the $\delta=10$ case on
alpar@4	1994	Figure~\ref{fig:randIND10}). If these examples had been excluded, the
alpar@3	1995	charts would have seemed to be similar to the other ones.
alpar@3	1996	Unsurprisingly, as denser graphs are considered, both VF2++ and VF2
alpar@3	1997	Plus slow slightly down, but remain practically usable even on graphs
alpar@3	1998	having 10 000 nodes.
alpar@2	1999
alpar@2	2000
alpar@2	2001
alpar@2	2002
alpar@3	2003
alpar@2	2004	\section{Conclusion}
alpar@3	2005	In this paper, after providing a short summary of the recent
alpar@3	2006	algorithms, a new graph matching algorithm based on VF2, called VF2++,
alpar@3	2007	has been presented and analyzed from a practical viewpoint.
alpar@2	2008
alpar@3	2009	Recognizing the importance of the node order and determining an
alpar@3	2010	efficient one, VF2++ is able to match graphs of thousands of nodes in
alpar@3	2011	near practically linear time including preprocessing. In addition to
alpar@3	2012	the proper order, VF2++ uses more efficient consistency and cutting
alpar@3	2013	rules which are easy to compute and make the algorithm able to prune
alpar@3	2014	most of the unfruitful branches without going astray.
alpar@2	2015
alpar@3	2016	In order to show the efficiency of the new method, it has been
alpar@3	2017	compared to VF2 Plus, which is the best concurrent algorithm based on
alpar@3	2018	\cite{VF2Plus}.
alpar@2	2019
alpar@3	2020	The experiments show that VF2++ consistently outperforms VF2 Plus on
alpar@3	2021	biological graphs. It seems to be asymptotically faster on protein and
alpar@3	2022	on contact map graphs in the case of induced subgraph isomorphism,
alpar@3	2023	while in the case of graph isomorphism, it has definitely better
alpar@3	2024	asymptotic behaviour on protein graphs.
alpar@2	2025
alpar@3	2026	Regarding random sparse graphs, not only has VF2++ proved itself to be
alpar@3	2027	faster than VF2 Plus, but it has a practically linear behaviour both
alpar@3	2028	in the case of induced subgraph- and graph isomorphism, as well.
alpar@2	2029
alpar@2	2030
alpar@0	2031
alpar@0	2032	%% The Appendices part is started with the command \appendix;
alpar@0	2033	%% appendix sections are then done as normal sections
alpar@0	2034	%% \appendix
alpar@0	2035
alpar@0	2036	%% \section{}
alpar@0	2037	%% \label{}
alpar@0	2038
alpar@0	2039	%% If you have bibdatabase file and want bibtex to generate the
alpar@0	2040	%% bibitems, please use
alpar@0	2041	%%
alpar@3	2042	\bibliographystyle{elsarticle-num} \bibliography{bibliography}
alpar@0	2043
alpar@0	2044	%% else use the following coding to input the bibitems directly in the
alpar@0	2045	%% TeX file.
alpar@0	2046
alpar@2	2047	%% \begin{thebibliography}{00}
alpar@0	2048
alpar@2	2049	%% %% \bibitem{label}
alpar@2	2050	%% %% Text of bibliographic item
alpar@0	2051
alpar@2	2052	%% \bibitem{}
alpar@0	2053
alpar@2	2054	%% \end{thebibliography}
alpar@2	2055
alpar@0	2056	\end{document}
alpar@0	2057	\endinput
alpar@0	2058	%%
alpar@0	2059	%% End of file `elsarticle-template-num.tex'.

author	Madarasi Peter
	Wed, 23 Nov 2016 02:07:56 +0100
changeset 7	4989d677d5e3
parent 5	c04b029e623a
child 8	49574b75404b
permissions	-rw-r--r--