# HG changeset patch # User Madarasi Peter # Date 1500760800 -7200 # Node ID 523fddfd7a01ccc4e41c8f2e8fe4dc4459142abe # Parent 497868c58d361c9a155c540c51e997783e4592df Meet Referee's requests diff -r 497868c58d36 -r 523fddfd7a01 bibliography.bib --- a/bibliography.bib Wed Nov 30 23:26:43 2016 +0100 +++ b/bibliography.bib Sun Jul 23 00:00:00 2017 +0200 @@ -10,16 +10,16 @@ %I think it's good enough. @Article{CordellaVentoSymbolRecognition, - author = "Cordella,L.P. and Vento, M.", + author = "L. P. Cordella and Vento, M.", title = "Symbol recognition in documents: a collection of techniques?", - journal = "International Journal on Document Analysis and Recognition Volume 3, Issue 2, pp 73-88, {December 2000}", + journal = "International Journal on Document Analysis and Recognition Volume 3, Issue 2, Pages 73-88, 2000", year = 2000, month = "December" } @Article{ProteinDataBank, author = "", - title = "Protein Data Bank, http://www.rcsb.org/pdb, {June 2015}", + title = "Protein Data Bank, http://www.rcsb.org/pdb, 2015", journal = "", year = 2015, month = "June" @@ -27,7 +27,7 @@ @Article{LEMON, author = "", - title = "LEMON: Library for Efficient Modeling and Optimization in Networks, https://lemon.cs.elte.hu", + title = "{LEMON}: Library for Efficient Modeling and Optimization in Networks, https://lemon.cs.elte.hu", journal = "", year = 2015, month = "March" @@ -35,27 +35,27 @@ @Article{QUANTUMBIO, author = "", - title = "QuantumBio Inc., http://www.quantumbioinc.com/" + title = "{QuantumBio Inc.}, http://www.quantumbioinc.com" } @Article{Content, author = "Vento, Mario and Jiang, Xiaoyi and Foggia, Pasquale", - title = "International contest on pattern search in biological databases, http://biograph2014.unisa.it, {June 2015}", + title = "International contest on pattern search in biological databases, http://biograph2014.unisa.it, 2015", journal = "", year = 2015, month = "June" } @Article{VF, - author = "Cordella, L. P. and Foggia, P. and Sansone, C. and Vento, M.", + author = "L. P. Cordella and Foggia, P. and Sansone, C. and Vento, M.", title = "Performance Evaluation of the {VF} Graph Matching Algorithm", - journal = "Proc. of the 10th ICIAP, IEEE Computer Society Press, pp. 1172-1177, 1999", + journal = "Proc. of the 10th ICIAP, IEEE Computer Society Press, Pages 1172-1177, 1999", year = 1999, month = "" } @Article{UllmannBit, - author = "Julian R. Ullmann", + author = "{J. R. Ullmann}", title = "Bit-vector algorithms for binary constraint satisfaction and subgraph isomorphism", journal = "Journal of Experimental Algorithmics (JEA),Volume 15, Article No. 1.6, ACM New York, NY, USA, 2010", year = 2010, @@ -65,31 +65,31 @@ @Article{Ullmann, author = "J. R. Ullmann", title = "An Algorithm for Subgraph Isomorphism", - journal = "Journal of the ACM (JACM), Volume 23 Issue 1, Pages 31-42, {January 1976}", + journal = "Journal of the ACM (JACM), Volume 23 Issue 1, Pages 31-42, 1976", year = 1976, month = "January" } @Article{SubgraphNPC, - author = "Cook, S. A.", + author = "S. A. Cook", title = "The complexity of theorem-proving procedures", - journal = "Proc. 3rd ACM Symposium on Theory of Computing, pp. 151-158, 1971", + journal = "Proc. 3rd ACM Symposium on Theory of Computing, Pages 151-158, 1971", year = 1971, month = "" } @Article{VF2, - author = "Cordella, L.P. and Foggia, P. and Sansone, C. and Vento, M.", + author = "L. P. Cordella and Foggia, P. and Sansone, C. and Vento, M.", title = "A (sub)graph isomorphism algorithm for matching large graphs ", - journal = "Journal of the ACM (JACM) JACM Homepage archive Volume 23 Issue 1, Pages 31-42, 2004", + journal = "IEEE Transactions on Pattern Analysis and Machine Intelligence Volume 26 Issue 10, Page 1367-1372, 2004", year = 2004, month = "" } @Article{VF2Plus, - author = "Carletti Vincenzo and Pasquale Foggia and Mario Vento", + author = "Carletti, Vincenzo and Foggia, Pasquale and Vento, Mario", title = "{VF2 Plus}: An Improved Version of {VF2} For Biological Graphs", - journal = "Conference: Graph-Based Representations in Pattern Recognition, At Beijing, {May 2015}", + journal = "Conference: Graph-Based Representations in Pattern Recognition, At Beijing, 2015", year = 2015, month = "May" } @@ -97,30 +97,30 @@ @Article{RI, author = "Vincenzo Bonnici and Rosalba Giugno and Alfredo Pulvirenti and Dennis Shasha and Alfredo Ferro", title = "A subgraph isomorphism algorithm and its application to biochemical data", - journal = "BMC Bioinformatics. 14(Suppl 7): S13., {April 2013}", + journal = "BMC Bioinformatics. 14(Suppl 7): S13., 2013", year = 2013, month = "April" } @Article{Lad, - author = "Brendan D. McKay", + author = "Christine Solnon", title = "AllDifferent-based filtering for subgraph isomorphism", - journal = "Artificial Intelligence 174,850-864, 2010", + journal = "Artificial Intelligence 174, Pages 850-864, 2010", year = 2010, month = "" } @Article{Nauty, - author = "Christine Solnon", + author = "B. D. McKay", title = "Practical Graph Isomorphism", - journal = "Congressus Numerantium, vol.30, pp. 45-87, 1981", + journal = "Congressus Numerantium, Volume 30, Pages 45-87, 1981", year = 1981, month = "" } @Article{JianzhuangYongFaceIdentification, author = "Jianzhuang Liu and Yong Tsui Lee", - title = "A Graph-Based Method for Face Identification from a Single 2D Line Drawing", - journal = "IEEE Transactions on Pattern Analysis and Machine Intelligence - Graph Algorithms and Computer Vision archive Volume 23 Issue 10, {October 2001}", + title = "A Graph-Based Method for Face Identification from a Single {2D} Line Drawing", + journal = "IEEE Transactions on Pattern Analysis and Machine Intelligence - Graph Algorithms and Computer Vision archive Volume 23 Issue 10, Pages 1106-1119, 2001", year = 2001, month = "October" } @@ -128,7 +128,7 @@ @Article{HorstBunkeApplications, author = "Horst Bunke", title = "Graph Matching: Theoretical Foundations, Algorithms, and Applications", - journal = "In International Conference on Vision Interface, pp. 82-84, May 2000", + journal = "In International Conference on Vision Interface, Pages 82-84, 2000", year = 2000, month = "May" } @@ -136,7 +136,7 @@ @Article{AlexandruApplications, author = "Alexandru T. Balaban", title = "Applications of graph theory in chemistry", - journal = "J. Chem. Inf. Comput. Sci.,25(3),pp 334-343, March 1985", + journal = "J. Chem. Inf. Comput. Sci.,25(3), Pages 334-343, 1985", year = 1985, month = "March" } @@ -144,7 +144,7 @@ @Article{PlanarGraphIso, author = "J. E. Hopcroft and J. K. Wong", title = "Linear time algorithm for isomorphism of planar graphs", - journal = "Proceeding STOC '74 Proceedings of the sixth annual ACM symposium on Theory of computing, Pages 172-184, April 1974", + journal = "Proceeding STOC '74 Proceedings of the sixth annual ACM symposium on Theory of computing, Pages 172-184, 1974", year = 1974, month = "April" } @@ -152,7 +152,7 @@ @Article{BondedDegGraphIso, author = "Eugene M. Luks", title = "Isomorphism of Graphs of Bounded Valence Can Be Tested in Polynomial Time", - journal = "Journal of Computer and System Sciences, Volume 25, Issue 1, Pages 42-65, August 1982", + journal = "Journal of Computer and System Sciences, Volume 25, Issue 1, Pages 42-65, 1982", year = 1982, month = "August" } @@ -160,7 +160,7 @@ @Article{IntervalGraphIso, author = "George S. Lue and Kellogg S. Booth", title = "A Linear Time Algorithm for Deciding Interval Graph Isomorphism", - journal = "Journal of the ACM (JACM), Volume 26, Issue 2, Pages 183-195, April 1979", + journal = "Journal of the ACM (JACM), Volume 26, Issue 2, Pages 183-195, 1979", year = 1979, month = "April" } @@ -168,15 +168,15 @@ @Article{PermGraphIso, author = "Charles J. Colbourn", title = "On testing isomorphism of permutation graphs", - journal = "Networks, Volume 11, Issue 1, Pages 13-21, March 1981", + journal = "Networks, Volume 11, Issue 1, Pages 13-21, 1981", year = 1981, month = "March" } @Article{ColoredHiperGraphIso, - author = "Arvind,V. and Das,B. and Köbler,J. and et al.", + author = "Arvind,V. and Das,B. and {K\"obler},J. and S. Toda", title = "Colored Hypergraph Isomorphism is Fixed Parameter Tractable", - journal = "Algorithmica Volume 71, pp 120-138, January 2015", + journal = "Algorithmica Volume 71, Pages 120-138, 2015", year = 2015, month = "January" } diff -r 497868c58d36 -r 523fddfd7a01 damecco.tex --- a/damecco.tex Wed Nov 30 23:26:43 2016 +0100 +++ b/damecco.tex Sun Jul 23 00:00:00 2017 +0200 @@ -126,18 +126,18 @@ nevertheless easier to compute - cutting rules significantly reducing the search space are applied. - In addition to the usual subgraph isomorphism, the paper also - presents specialized versions for the \emph{Induced Subgraph + In addition to the usual \emph{Subgraph Isomorphism Problem}, the paper also + presents specialized algorithms for the \emph{Induced Subgraph Isomorphism} and for the \emph{Graph Isomorphism Problems}. Finally, an extensive experimental evaluation is provided using a - wide range of inputs, including both real life biological and + wide range of inputs, including both real-life biological and chemical datasets and standard randomly generated graph series. The results show major and consistent running time improvements over the other known methods. - The C++ implementations of the algorithms are available open source as - the part of the LEMON graph and network optimization library. + The C++ implementations of the algorithms are available open-source as + part of the LEMON graph and network optimization library. \end{abstract} @@ -172,9 +172,9 @@ systems at the molecular level is of primary importance, since protein-protein interaction, DNA-protein interaction, metabolic interaction, transcription factor binding, neuronal networks, and -hormone signaling networks can be understood this way. +hormone signalling networks can be understood this way. -Many chemical and biological structures can easily be modeled +Many chemical and biological structures can easily be modelled as graphs, for instance, a molecular structure can be considered as a graph, whose nodes correspond to atoms and whose edges to chemical bonds. The similarity and dissimilarity of @@ -185,18 +185,16 @@ Other real-world fields related to some variants of graph matching include pattern recognition -and machine vision \cite{HorstBunkeApplications}, symbol recognition -\cite{CordellaVentoSymbolRecognition}, face identification -\cite{JianzhuangYongFaceIdentification}. \\ +and machine vision~\cite{HorstBunkeApplications}, symbol recognition~\cite{CordellaVentoSymbolRecognition}, and face identification~\cite{JianzhuangYongFaceIdentification}. \\ Subgraph and induced subgraph matching problems are known to be -NP-Complete\cite{SubgraphNPC}, while the graph isomorphism problem is +NP-Complete~\cite{SubgraphNPC}, while the graph isomorphism problem is one of the few problems in NP neither known to be in P nor -NP-Complete. Although polynomial time isomorphism algorithms are known +NP-Complete. Although polynomial-time isomorphism algorithms are known for various graph classes, like trees and planar -graphs\cite{PlanarGraphIso}, bounded valence -graphs\cite{BondedDegGraphIso}, interval graphs\cite{IntervalGraphIso} -or permutation graphs\cite{PermGraphIso}, and recently, an FPT algorithm has been presented for the coloured hypergraph isomorphism problem in \cite{ColoredHiperGraphIso}. +graphs~\cite{PlanarGraphIso}, bounded valence +graphs~\cite{BondedDegGraphIso}, interval graphs~\cite{IntervalGraphIso} +or permutation graphs~\cite{PermGraphIso}. Furthermore, an FPT algorithm has also been presented for the coloured hypergraph isomorphism problem in~\cite{ColoredHiperGraphIso}. In the following, some algorithms based on other approaches are summarized, which do not need any restrictions on the graphs. Even though, @@ -206,52 +204,50 @@ algorithm is known. The first practically usable approach was due to -\emph{Ullmann}\cite{Ullmann} which is a commonly used depth-first -search based algorithm with a complex heuristic for reducing the +\emph{Ullmann}~\cite{Ullmann}, which is a commonly used algorithm based on depth-first +search with a complex heuristic for reducing the number of visited states. A major problem is its $\Theta(n^3)$ space complexity, which makes it impractical in the case of big sparse graphs. -In a recent paper, Ullmann\cite{UllmannBit} presents an +In a recent paper, Ullmann~\cite{UllmannBit} presents an improved version of this algorithm based on a bit-vector solution for the binary Constraint Satisfaction Problem. -The \emph{Nauty} algorithm\cite{Nauty} transforms the two graphs to -a canonical form before starting to check for the isomorphism. It has +The \emph{Nauty} algorithm~\cite{Nauty} transforms the two graphs to +a canonical form before starting to check for isomorphism. It has been considered as one of the fastest graph isomorphism algorithms, although graph categories were shown in which it takes exponentially many steps. This algorithm handles only the graph isomorphism problem. -The \emph{LAD} algorithm\cite{Lad} uses a depth-first search +The \emph{LAD} algorithm~\cite{Lad} uses a depth-first search strategy and formulates the matching as a Constraint Satisfaction Problem to prune the search tree. The constraints are that the mapping has to be injective and edge-preserving, hence it is possible to handle new matching types as well. -The \emph{RI} algorithm\cite{RI} and its variations are based on a +The \emph{RI} algorithm~\cite{RI} and its variations are based on a state space representation. After reordering the nodes of the graphs, it uses some fast executable heuristic checks without using any complex pruning rules. It seems to run really efficiently on graphs coming from biology, and won the International Contest on Pattern -Search in Biological Databases\cite{Content}. +Search in Biological Databases~\cite{Content}. -The currently most commonly used algorithm is the -\emph{VF2}\cite{VF2}, the improved version of \emph{VF}\cite{VF}, which was +Currently, the most commonly used algorithm is the +\emph{VF2}~\cite{VF2}, the improved version of \emph{VF}~\cite{VF}, which was designed for solving pattern matching and computer vision problems, and has been one of the best overall algorithms for more than a -decade. Although, it can't be up to new specialized algorithms, it is -still widely used due to its simplicity and space efficiency. VF2 uses +decade. Although, it is not as fast as some of the new specialized algorithms, it is still widely used due to its simplicity and space efficiency. VF2 uses a state space representation and checks some conditions in each state to prune the search tree. -Meanwhile, another variant called \emph{VF2 Plus}\cite{VF2Plus} has +Meanwhile, another variant called \emph{VF2~Plus}~\cite{VF2Plus} has been published. It is considered to be as efficient as the RI -algorithm and has a strictly better behavior on large graphs. The -main idea of VF2 Plus is to precompute a heuristic node order of the -small graph, in which the VF2 works more efficiently. +algorithm and has a strictly better behaviour on large graphs. The +main idea of VF2~Plus is to precompute a heuristic node order of the graph to be embedded, in which VF2 works more efficiently. This paper introduces \emph{VF2++}, a new further improved algorithm -for the graph and (induced)subgraph isomorphism problem, which uses +for the graph and (induced) subgraph isomorphism problems, which uses efficient cutting rules and determines a node order in which VF2 runs significantly faster on practical inputs. @@ -261,12 +257,12 @@ on that, Section~\ref{sec:VF2ppAlg} introduces VF2++. Some technical details necessary for an efficient implementation are discussed in Section~\ref{sec:VF2ppImpl}. Finally, Section~\ref{sec:ExpRes} -provide a detailed experimental evaluation of VF2++ and its comparison +provides a detailed experimental evaluation of VF2++ and its comparison to the state-of-the-art algorithm. It must also be mentioned that the C++ implementations of the algorithms have been made available for evaluation and use under an -open source license as a part of LEMON\cite{LEMON} open source graph +open-source license as a part of LEMON~\cite{LEMON} graph library. \section{Problem Statement}\label{sec:ProbStat} @@ -321,37 +317,30 @@ \subsection{Common problems}\label{sec:CommProb} -The focus of this paper is on two extensively studied topics, the -subgraph isomorphism and its variations. However, the following -problems also appear in many applications. +The focus of this paper is on the following problems appearing in many applications. -The \textbf{subgraph matching problem} is the following: is +The \textbf{subgraph isomorphism problem} is the following: is $G_{1}$ isomorphic to any subgraph of $G_{2}$ by a given node label? -The \textbf{induced subgraph matching problem} asks the same about the +The \textbf{induced subgraph isomorphism problem} asks the same about the existence of an induced subgraph. The \textbf{graph isomorphism problem} can be defined as induced subgraph matching problem where the sizes of the two graphs are equal. -In addition, one may want to find a \textbf{single} mapping or \textbf{enumerate} all of them. - -Note that some authors refer to the term -\emph{subgraph isomorphism problem} as an \emph{induced subgraph - isomorphism problem}. +In addition, one may want to find a \textbf{single} embedding or \textbf{enumerate} all of them. \section{The VF2 Algorithm}\label{sec:VF2Alg} -This algorithm is the basis of both the VF2++ and the VF2 Plus. VF2 -is able to handle all the variations mentioned in Section - \ref{sec:CommProb}. Although it can also handle directed graphs, +This algorithm is the basis of both the VF2++ and the VF2~Plus. VF2 +is able to handle all the variations mentioned in Section~\ref{sec:CommProb}. Although it can also handle directed graphs, for the sake of simplicity, only the undirected case will be discussed. \subsection{Common notations} \indent Assume $G_{1}$ is searched in $G_{2}$. The following -definitions and notations will be used throughout the whole paper. +definitions and notations will be used throughout this paper. \begin{definition} An injection $\mathfrak{m} : D \longrightarrow V_2$ is called (partial) \textbf{mapping}, where $D\subseteq V_1$. \end{definition} @@ -370,29 +359,29 @@ \end{definition} \begin{definition} -Let \textbf{extend}$(\mathfrak{m},(u,v))$ denote the function $f : \mathfrak{D}(\mathfrak{m})\cup\{u\}\longrightarrow\mathfrak{R}(\mathfrak{m})\cup\{v\}$, for which $\forall w\in \mathfrak{D}(\mathfrak{m}) : \mathfrak{m}(w)=f(w)$ and $f(u)=v$ holds. Where $u\in V_1\setminus\mathfrak{D}(\mathfrak{m})$ and $v\in V_2\setminus\mathfrak{R}(\mathfrak{m})$, otherwise $extend(\mathfrak{m},(u,v))$ is undefined. +Let \textbf{extend}$(\mathfrak{m},(u,v))$ denote the function $f : \mathfrak{D}(\mathfrak{m})\cup\{u\}\longrightarrow\mathfrak{R}(\mathfrak{m})\cup\{v\}$, for which $\forall w\in \mathfrak{D}(\mathfrak{m}) : f(w)=\mathfrak{m}(w)$ and $f(u)=v$ holds, where $u\in V_1\setminus\mathfrak{D}(\mathfrak{m})$ and $v\in V_2\setminus\mathfrak{R}(\mathfrak{m})$; otherwise $extend(\mathfrak{m},(u,v))$ is undefined. \end{definition} \begin{notation} Throughout the paper, $\mathbf{PT}$ denotes a generic problem type -which can be substituted by any of the $\mathbf{ISO}$, $\mathbf{SUB}$ -and $\mathbf{IND}$ problems. +which can be substituted by any of the $\mathbf{SUB}$, $\mathbf{IND}$ +and $\mathbf{ISO}$ problems, which stand for the the problems mentioned in Section~\ref{sec:CommProb} respectively. \end{notation} \begin{definition} -Let $\mathfrak{m}$ be a mapping. A logical function $\mathbf{Cons_{PT}}$ is a -\textbf{consistency function by } $\mathbf{PT}$ if the following -holds. If there exists a whole mapping $w$ satisfying the requirements of $PT$, for which $\mathfrak{m}$ is exactly $w$ restricted to $\mathfrak{D}(\mathfrak{m})$. +Let $\mathfrak{m}$ be a mapping. The \textbf{consistency function for } $\mathbf{PT}$ is a logical function, for which $\mathbf{Cons_{PT}}(\mathfrak{m})$ is true if and only if $\mathfrak{m}$ satisfies the requirements of $\mathbf{PT}$ considering the subgraphs of $G_{1}$ and $G_{2}$ induced by $\mathfrak{D}(\mathfrak{m})$ and $\mathfrak{R}(\mathfrak{m})$, respectively. + +%$\mathbf{Cons_{PT}}(\mathfrak{m})$ is true if there exists a whole mapping $w$ satisfying the requirements of $PT$, for which $\mathfrak{m}$ is exactly $w$ restricted to $\mathfrak{D}(\mathfrak{m})$. \end{definition} \begin{definition} Let $\mathfrak{m}$ be a mapping. A logical function $\mathbf{Cut_{PT}}$ is a -\textbf{cutting function by } $\mathbf{PT}$ if the following +\textbf{cutting function for } $\mathbf{PT}$ if the following holds. $\mathbf{Cut_{PT}(\mathfrak{m})}$ is false if there exists a sequence of extend operations, which results in a whole mapping satisfying the requirements of $PT$. \end{definition} \begin{definition} -$\mathfrak{m}$ is said to be \textbf{consistent mapping by} $\mathbf{PT}$ if +A mapping $\mathfrak{m}$ is said to be \textbf{consistent mapping by} $\mathbf{PT}$ if $Cons_{PT}(\mathfrak{m})$ is true. \end{definition} @@ -408,12 +397,11 @@ no whole consistent mapping can contain the current mapping. \subsection{Overview of the algorithm} -VF2 uses a state space representation of mappings, $Cons_{PT}$ for -excluding inconsistency with the problem type and $Cut_{PT}$ for -pruning the search tree. -Algorithm~\ref{alg:VF2Pseu} is a high level description of -the VF2 matching algorithm. Each state of the matching process can +VF2 begins with an empty mapping and gradually extends it with respect to the consistency and cutting functions until a whole mapping is reached. + +Algorithm~\ref{alg:VF2Pseu} is a high-level description of +the VF2 algorithm. Each state of the matching process can be associated with a mapping $\mathfrak{m}$. The initial state is associated with a mapping $\mathfrak{m}$, for which $\mathfrak{D}(\mathfrak{m})=\emptyset$, i.e. it starts with an empty mapping. @@ -430,8 +418,7 @@ \If{$\mathfrak{m}$ covers $V_{1}$} \State Output($\mathfrak{m}$) \Else - \State Compute the set $P_\mathfrak{m}$ of the pairs candidate for inclusion - in $\mathfrak{m}$ \ForAll{$p\in{P_\mathfrak{m}}$} \If{Cons$_{PT}$($p,\mathfrak{m}$) $\wedge$ + \State Compute the set $P_\mathfrak{m}$ of the candidate pairs for extending $\mathfrak{m}$ \ForAll{$p\in{P_\mathfrak{m}}$} \If{Cons$_{PT}$($p,\mathfrak{m}$) $\wedge$ $\neg$Cut$_{PT}$($p,\mathfrak{m}$)} \State \textbf{call} VF2($extend(\mathfrak{m},p)$, $PT$) \EndIf \EndFor \EndIf \EndProcedure @@ -440,7 +427,7 @@ For the current mapping $\mathfrak{m}$, the algorithm computes $P_\mathfrak{m}$, the set of -candidate node pairs for adding to the current mapping $\mathfrak{m}_s$. +candidate node pairs for extending the current mapping $\mathfrak{m}$. For each pair $p$ in $P_\mathfrak{m}$, $Cons_{PT}(p,\mathfrak{m})$ and $Cut_{PT}(p,\mathfrak{m})$ are evaluated. If the former is true and @@ -448,8 +435,8 @@ $extend(\mathfrak{m},p)$. Otherwise, $extend(\mathfrak{m},p)$ is not consistent by $PT$, or it can be proved that $\mathfrak{m}$ can not be extended to a whole mapping. -In order to make sure of the correctness, see -\begin{claim} +In order to make sure of the correctness, see Claim~\ref{claim:consMapps}. +\begin{claim}\label{claim:consMapps} Through consistent mappings, only consistent whole mappings can be reached, and all the consistent whole mappings are reachable through consistent mappings. @@ -458,20 +445,30 @@ Note that a mapping may be reached in exponentially many different ways, since the order of extensions does not influence the nascent mapping. -However, one may observe +However, one may make the following observations. + +%\begin{claim} +%\label{claim:claimTotOrd} +%Let $\prec$ be an arbitrary total ordering relation on $V_{1}$. If +%the algorithm ignores each $p=(u,v) \in P_\mathfrak{m}$, for which +%\begin{center} +%$\exists (\tilde{u},\tilde{v})\in P_\mathfrak{m}: \tilde{u} \prec u$, +%\end{center} +%then no mapping can be reached more than once, and each whole mapping %remains reachable. +%\end{claim} + +\begin{definition} +A total order $(u_{\sigma(1)},u_{\sigma(2)},..,u_{\sigma(|V_{1}|)})$ of +$V_{1}$ is \textbf{matching order} if VF2 can cover $u_{\sigma(d)}$ on the $d$-th level for all $d\in\{1,..,|V_{1}|\}$. +\end{definition} \begin{claim} \label{claim:claimTotOrd} -Let $\prec$ be an arbitrary total ordering relation on $V_{1}$. If -the algorithm ignores each $p=(u,v) \in P_\mathfrak{m}$, for which -\begin{center} -$\exists (\tilde{u},\tilde{v})\in P_\mathfrak{m}: \tilde{u} \prec u$, -\end{center} -then no mapping can be reached more than once, and each whole mapping remains reachable. +If VF2 is prescribed to cover the nodes of $G_1$ according to a matching order, then no mapping can be reached more than once and each whole mapping remains reachable. \end{claim} -Note that the cornerstone of the improvements to VF2 is a proper -choice of a total ordering. +Note that the cornerstone of the improvements to VF2 is to choose a proper +matching order. \subsection{The candidate set} \label{candidateComputingVF2} @@ -482,7 +479,7 @@ $\mathbf{T_{2}(\mathfrak{m})} := \{v \in V_{2}\backslash\mathfrak{R}(\mathfrak{m}) : \exists\tilde{v}\in{\mathfrak{R}(\mathfrak{m}):(v,\tilde{v})\in E_{2}}\}$. \end{notation} -The set $P_\mathfrak{m}$ includes the pairs of uncovered neighbours of covered +The set $P_\mathfrak{m}$ contains the pairs of uncovered neighbours of covered nodes, and if there is not such a node pair, all the pairs containing two uncovered nodes are added. Formally, let \[ @@ -496,27 +493,28 @@ \] \subsection{Consistency} -Suppose $p=(u,v)$, where $u\in V_{1}$ and $v\in V_{2}$, $\mathfrak{m}$ is a consistent mapping by +Let $p=(u,v)\in V_{1}\times V_{2}$, and suppose $\mathfrak{m}$ is a consistent mapping by $PT$. $Cons_{PT}(p,\mathfrak{m})$ checks whether -including pair $p$ into $\mathfrak{m}$ leads to a consistent mapping by $PT$. +adding pair $p$ into $\mathfrak{m}$ leads to a consistent mapping by $PT$. -For example, the consistency function of induced subgraph isomorphism is as follows. +For example, the consistency function of the induced subgraph isomorphism problem is the following. \begin{notation} Let $\mathbf{\Gamma_{1} (u)}:=\{\tilde{u}\in V_{1} : (u,\tilde{u})\in E_{1}\}$, and $\mathbf{\Gamma_{2} - (v)}:=\{\tilde{v}\in V_{2} : (v,\tilde{v})\in E_{2}\}$, where $u\in V_{1}$ and $v\in V_{2}$. + (v)}:=\{\tilde{v}\in V_{2} : (v,\tilde{v})\in E_{2}\}$, where $u\in V_{1}$ and $v\in V_{2}$. That is, $\mathbf{\Gamma_{i} (w)}$ denotes the set of neighbours of node $w$ in $G_i$ $(i=1,2)$. \end{notation} -$extend(\mathfrak{m},(u,v))$ is a consistent mapping by $IND$ $\Leftrightarrow -(\forall \tilde{u}\in \mathfrak{D}(\mathfrak{m}): (u,\tilde{u})\in E_{1} -\Leftrightarrow (v,\mathfrak{m}(\tilde{u}))\in E_{2})$. The -following formulation gives an efficient way of calculating -$Cons_{IND}$. \begin{claim} -$Cons_{IND}((u,v),\mathfrak{m}):=\mathcal{L}(u)\!\!=\!\!\mathcal{L}(v)\wedge(\forall \tilde{v}\in \Gamma_{2}(v)\cap\mathfrak{R}(\mathfrak{m}):(u,\mathfrak{m}^{-1}(\tilde{v}))\in E_{1})\wedge +$extend(\mathfrak{m},(u,v))$ is a consistent mapping by $IND$ if and only if $\mathfrak{m}$ is consistent and $(\forall \tilde{u}\in \mathfrak{D}(\mathfrak{m}): (u,\tilde{u})\in E_{1} +\Leftrightarrow (v,\mathfrak{m}(\tilde{u}))\in E_{2})$. +\end{claim} + +The following formulation gives an efficient way of calculating $Cons_{IND}$. + +\begin{claim} +$Cons_{IND}((u,v),\mathfrak{m}):=Cons_{IND}(\mathfrak{m})\wedge\mathcal{L}(u)\!\!=\!\!\mathcal{L}(v)\wedge(\forall \tilde{v}\in \Gamma_{2}(v)\cap\mathfrak{R}(\mathfrak{m}):(u,\mathfrak{m}^{-1}(\tilde{v}))\in E_{1})\wedge (\forall \tilde{u}\in \Gamma_{1}(u) - \cap \mathfrak{D}(\mathfrak{m}):(v,\mathfrak{m}(\tilde{u}))\in E_{2})$ is a - consistency function in the case of $IND$. + \cap \mathfrak{D}(\mathfrak{m}):(v,\mathfrak{m}(\tilde{u}))\in E_{2})$ is the consistency function for $IND$. \end{claim} \subsection{Cutting rules} @@ -525,7 +523,7 @@ be true only if it is impossible to extend $extend(\mathfrak{m},p)$ to a whole mapping. -As an example, the cutting function of induced subgraph isomorphism is presented. +As an example, a cutting function of induced subgraph isomorphism problem is presented. \begin{notation} Let $\mathbf{\tilde{T}_{1}}(\mathfrak{m}):=(V_{1}\backslash \mathfrak{D}(\mathfrak{m}))\backslash T_{1}(\mathfrak{m})$, and @@ -537,28 +535,28 @@ $Cut_{IND}((u,v),\mathfrak{m}):= |\Gamma_{2} (v)\ \cap\ T_{2}(\mathfrak{m})| < |\Gamma_{1} (u)\ \cap\ T_{1}(\mathfrak{m})| \vee |\Gamma_{2}(v)\cap \tilde{T}_{2}(\mathfrak{m})| < |\Gamma_{1}(u)\cap - \tilde{T}_{1}(\mathfrak{m})|$ is a cutting function by $IND$. + \tilde{T}_{1}(\mathfrak{m})|$ is a cutting function for $IND$. \end{claim} \section{The VF2++ Algorithm}\label{sec:VF2ppAlg} -Although any total ordering relation makes the search space of VF2 a +Although any matching order makes the search space of VF2 a tree, its choice turns out to dramatically influence the number of visited states. The goal is to determine an efficient one as quickly as possible. -The main reason for VF2++' superiority over VF2 is twofold. Firstly, -taking into account the structure and the node labeling of the graph, -VF2++ determines a state order in which most of the unfruitful +The main reason for the superiority of VF2++ over VF2 is twofold. Firstly, +taking into account the structure and the node labelling of the graph, +VF2++ determines a matching order in which most of the unfruitful branches of the search space can be pruned immediately. Secondly, introducing more efficient --- nevertheless still easier to compute --- cutting rules reduces the chance of going astray even further. -In addition to the usual subgraph isomorphism, specialized versions -for induced subgraph isomorphism and for graph isomorphism have been +In addition to the usual subgraph isomorphism problem, specialized versions +for induced subgraph and graph isomorphism problems have been designed. -Note that a weaker version of the cutting rules and an efficient -candidate set calculating were described in \cite{VF2Plus}. +Note that a weaker version of the cutting rules of VF2++ and an efficient +candidate set calculation method were described in~\cite{VF2Plus}. It should be noted that all the methods described in this section are extendable to handle directed graphs and edge labels as well. @@ -570,7 +568,7 @@ highest levels and goes deep only if it is needed. \begin{notation} -Let $\mathbf{Conn_{H}(u)}:=|\Gamma_{1}(u)\cap H\}|$, that is the +Let $\mathbf{Conn_{H}(u)}:=|\Gamma_{1}(u)\cap H|$, that is the number of neighbours of u which are in H, where $u\in V_{1} $ and $H\subseteq V_{1}$. \end{notation} @@ -582,22 +580,20 @@ ---, the more rarely satisfiable consistency constraints for its pair are given. -In biology, most of the graphs are sparse, thus several nodes in +Most of the graphs of biological and chemical structures are sparse, thus several nodes in $T_{1}(\mathfrak{m})$ may have the same $Conn_{\mathfrak{D}(\mathfrak{m})}$, which makes reasonable to define a secondary and a tertiary order between them. The observation above proves itself to be as determining, that the secondary ordering prefers nodes with the most uncovered neighbours among which have the same $Conn_{\mathfrak{D}(\mathfrak{m})}$ to increase -$Conn_{\mathfrak{D}(\mathfrak{m})}$ of uncovered nodes so much, as possible. The -tertiary ordering prefers nodes having the rarest uncovered labels. +$Conn_{\mathfrak{D}(\mathfrak{m})}$ of uncovered nodes as much, as possible. The tertiary ordering prefers nodes having the rarest uncovered labels in $G_2$. -Note that the secondary ordering is the same as the ordering by $deg$, +Note that the secondary ordering is the same as ordering by degrees, which is a static data in front of the above used. These rules can easily result in a matching order which contains the nodes of a long path successively, whose nodes may have low $Conn$ and -is easily matchable into $G_{2}$. To avoid that, a BFS order is -used, which provides the shortest possible paths. +is easily matchable into $G_{2}$. To try to avoid that, a Breadth-first-search order is used, and on each of its levels, the ordering procedure described above is applied. \newline In the following, some examples on which the VF2 may be slow are @@ -617,22 +613,20 @@ \newline $\mathcal{L}(\tilde{v}):=red \ \forall \tilde{v}\in V_{2}\backslash \{v\}$ -\newline Now, any mapping by $\mathcal{L}$ must contain $(u,v)$, since $u$ is black and no node in $V_{2}$ has a black label except $v$. If unfortunately $u$ were the last node which will get covered, VF2 would check only in the last steps, whether $u$ can be matched to $v$. -\newline + However, had $u$ been the first matched node, u would have been matched immediately to v, so all the mappings would have been precluded in which node labels can not correspond. \end{example} \begin{example} -Suppose there is no node label given, $G_{1}$ is a small graph and -can not be mapped into $G_{2}$ and $u\in V_{1}$. +Suppose there is no node label given, and $G_{1}$ is a small graph that can not be mapped into $G_{2}$ and $u\in V_{1}$. \newline Let $G'_{1}:=(V_{1}\cup \{u'_{1},u'_{2},..,u'_{k}\},E_{1}\cup @@ -640,10 +634,7 @@ $G'_{1}$ is $G_{1}\cup \{ a\ k$ long path, which is disjoint from $G_{1}$ and one of its starting points is connected to $u\in V_{1}\}$. -\newline -Is there a subgraph of $G_{2}$, which is isomorph with -$G'_{1}$? -\newline + If unfortunately the nodes of the path were the first $k$ nodes in the matching order, the algorithm would iterate through all the possible k long paths in $G_{2}$, and it would recognize that no path can be @@ -656,45 +647,48 @@ These examples may look artificial, but the same problems also appear in real-world instances, even though in a less obvious way. -\subsection{Preparations} -\begin{claim} -\label{claim:claimCoverFromLeft} -The total ordering relation uniquely determines a node order, in which -the nodes of $V_{1}$ will be covered by VF2. From the point of -view of the matching procedure, this means, that always the same node -of $G_{1}$ will be covered on the d-th level. -\end{claim} +%\subsection{Preparations} +%\begin{claim} +%\label{claim:claimCoverFromLeft} +%The total ordering relation uniquely determines a node order, in which +%the nodes of $V_{1}$ will be covered by VF2. From the point of +%view of the matching procedure, this means, that always the same node +%of $G_{1}$ will be covered on the $d$-th level. +%\end{claim} -\begin{definition} -An order $(u_{\sigma(1)},u_{\sigma(2)},..,u_{\sigma(|V_{1}|)})$ of -$V_{1}$ is \textbf{matching order} if exists $\prec$ total -ordering relation, s.t. the VF2 with $\prec$ on the d-th level finds -pair for $u_{\sigma(d)}$ for all $d\in\{1,..,|V_{1}|\}$. -\end{definition} +%\begin{definition} +%An order $(u_{\sigma(1)},u_{\sigma(2)},..,u_{\sigma(|V_{1}|)})$ of +%$V_{1}$ is \textbf{matching order} if there exists $\prec$ total +%ordering relation, s.t. the VF2 with $\prec$ on the d-th level finds +%pair for $u_{\sigma(d)}$ for all $d\in\{1,..,|V_{1}|\}$. +%\end{definition} -\begin{claim}\label{claim:MOclaim} -A total ordering is matching order iff the nodes of every component -form an interval in the node sequence, and every node connects to a -previous node in its component except the first node of each component. -\end{claim} +%\begin{claim}\label{claim:MOclaim} +%A total ordering is matching order iff the nodes of every component +%form an interval in the node sequence, and every node connects to a +%previous node in its component except the first node of each component. +%\end{claim} -To summing up, a total ordering always uniquely determines a matching -order, and every matching order can be determined by a total ordering, -however, more than one different total orderings may determine the -same matching order. +%In summary, a total ordering always uniquely determines a matching +%order, and every matching order can be determined by a total ordering, +%however, more than one different total orderings may determine the +%same matching order. -\subsection{Total ordering} -The matching order will be searched directly. +\subsection{Matching order} \begin{notation} Let \textbf{F$_\mathcal{M}$(l)}$:=|\{v\in V_{2} : -l=\mathcal{L}(v)\}|-|\{u\in V_{1}\backslash \mathcal{M} : l=\mathcal{L}(u)\}|$ , +l=\mathcal{L}(v)\}|-|\{u\in \mathcal{M} : l=\mathcal{L}(u)\}|$, where $l$ is a label and $\mathcal{M}\subseteq V_{1}$. \end{notation} -\begin{definition}Let $\mathbf{arg\ max}_{f}(S) :=\{u\in S : f(u)=max_{v\in S}\{f(v)\}\}$ and $\mathbf{arg\ min}_{f}(S) := arg\ max_{-f}(S)$, where $S$ is a finite set and $f:S\longrightarrow \mathbb{R}$. +\begin{definition}Let $\mathbf{arg\ max}_{f}(S) :=\{u\in S : f(u)=max_{v\in S}\{f(v)\}\}$ and $\mathbf{arg\ min}_{f}(S) := arg\ max_{(-f)}(S)$, where $S$ is a finite set and $f:S\longrightarrow \mathbb{R}$. \end{definition} -\begin{algorithm} +\begin{notation} +Let $deg(v)$ denote the degree of node $v$. +\end{notation} + +\begin{algorithm}[H] \algtext*{EndIf} \algtext*{EndProcedure} \algtext*{EndWhile} @@ -707,8 +701,7 @@ min$_{F_\mathcal{M}\circ \mathcal{L}}(V_{1}\backslash \mathcal{M})$)\label{alg:findMin} \State Compute $T$, a BFS tree with root node $r$. \For{$d=0,1,...,depth(T)$} \State $V_d$:=nodes of the -$d$-th level \State Process $V_d$ \Comment{See Algorithm - \ref{alg:VF2PPProcess1}} \EndFor +$d$-th level \State Process $V_d$ \Comment{See Algorithm~\ref{alg:VF2PPProcess1}} \EndFor \EndWhile \EndProcedure \end{algorithmic} \end{algorithm} @@ -720,23 +713,23 @@ \caption{\hspace{.5cm}$The\ method\ for\ processing\ a\ level\ of\ the\ BFS\ tree$}\label{alg:VF2PPProcess1} \begin{algorithmic}[1] \Procedure{VF2++ProcessLevel}{$V_{d}$} \While{$V_d\neq\emptyset$} -\State $m\in$ arg min$_{F_{\mathcal{M}\circ\ \mathcal{L}}}($ arg max$_{deg}($arg +\State $m\in$ arg min$_{F_\mathcal{M}\circ\ \mathcal{L}}($ arg max$_{deg}($arg max$_{Conn_{\mathcal{M}}}(V_{d})))$ \State $V_d:=V_d\backslash m$ \State Append node $m$ to the end of $\mathcal{M}$ \State Refresh $F_\mathcal{M}$ \EndWhile \EndProcedure \end{algorithmic} \end{algorithm} -Algorithm~\ref{alg:VF2PPPseu} is a high level description of the +Algorithm~\ref{alg:VF2PPPseu} is a high-level description of the matching order procedure of VF2++. It computes a BFS tree for each component in ascending order of their rarest node labels and largest $deg$, -whose root vertex is the component's minimal -node. Algorithm~\ref{alg:VF2PPProcess1} is a method to process a level of the BFS tree, which appends the nodes of the current level in descending +whose root vertex is the minimal node of its component. Algorithm~\ref{alg:VF2PPProcess1} is a method to process a level of the BFS tree, which appends the nodes of the current level in descending lexicographic order by $(Conn_{\mathcal{M}},deg,-F_\mathcal{M})$ separately to $\mathcal{M}$, and refreshes $F_\mathcal{M}$ immediately. -Claim~\ref{claim:MOclaim} shows that Algorithm~\ref{alg:VF2PPPseu} -provides a matching order. +\begin{claim} +Algorithm~\ref{alg:VF2PPPseu} provides a matching order. +\end{claim} \subsection{Cutting rules} @@ -750,18 +743,16 @@ V_{2}$ and $l$ is a label. \end{notation} -\subsubsection{Induced subgraph isomorphism} -\begin{claim} -\[LabCut_{IND}((u,v),\mathfrak{m}):=\bigvee_{l\ is\ label}|\Gamma_{2}^{l} (v) \cap T_{2}(\mathfrak{m})|\!<\!|\Gamma_{1}^{l}(u)\cap T_{1}(\mathfrak{m})|\ \vee\]\[\bigvee_{l\ is\ label} \newline |\Gamma_{2}^{l}(v)\cap \tilde{T}_{2}(\mathfrak{m})| < |\Gamma_{1}^{l}(u)\cap \tilde{T}_{1}(\mathfrak{m})|\] is a cutting function by IND. -\end{claim} -\subsubsection{Graph isomorphism} -\begin{claim} -\[LabCut_{ISO}((u,v),\mathfrak{m}):=\bigvee_{l\ is\ label}|\Gamma_{2}^{l} (v) \cap T_{2}(\mathfrak{m})|\!\neq\!|\Gamma_{1}^{l}(u)\cap T_{1}(\mathfrak{m})|\ \vee\]\[\bigvee_{l\ is\ label} \newline |\Gamma_{2}^{l}(v)\cap \tilde{T}_{2}(\mathfrak{m})| \neq |\Gamma_{1}^{l}(u)\cap \tilde{T}_{1}(\mathfrak{m})|\] is a cutting function by ISO. +\begin{claim}[Cutting function for ISO] +\[LabCut_{ISO}((u,v),\mathfrak{m}):=\bigvee_{l\ is\ label}|\Gamma_{2}^{l} (v) \cap T_{2}(\mathfrak{m})|\!\neq\!|\Gamma_{1}^{l}(u)\cap T_{1}(\mathfrak{m})|\ \vee\]\[\bigvee_{l\ is\ label} \newline |\Gamma_{2}^{l}(v)\cap \tilde{T}_{2}(\mathfrak{m})| \neq |\Gamma_{1}^{l}(u)\cap \tilde{T}_{1}(\mathfrak{m})|\] is a cutting function for ISO. \end{claim} -\subsubsection{Subgraph isomorphism} -\begin{claim} -\[LabCut_{SU\!B}((u,v),\mathfrak{m}):=\bigvee_{l\ is\ label}|\Gamma_{2}^{l} (v) \cap T_{2}(\mathfrak{m})|\!<\!|\Gamma_{1}^{l}(u)\cap T_{1}(\mathfrak{m})|\] is a cutting function by SUB. +\begin{claim}[Cutting function for IND] +\[LabCut_{IND}((u,v),\mathfrak{m}):=\bigvee_{l\ is\ label}|\Gamma_{2}^{l} (v) \cap T_{2}(\mathfrak{m})|\!<\!|\Gamma_{1}^{l}(u)\cap T_{1}(\mathfrak{m})|\ \vee\]\[\bigvee_{l\ is\ label} \newline |\Gamma_{2}^{l}(v)\cap \tilde{T}_{2}(\mathfrak{m})| < |\Gamma_{1}^{l}(u)\cap \tilde{T}_{1}(\mathfrak{m})|\] is a cutting function for IND. +\end{claim} + +\begin{claim}[Cutting function for SUB] +\[LabCut_{SU\!B}((u,v),\mathfrak{m}):=\bigvee_{l\ is\ label}|\Gamma_{2}^{l} (v) \cap T_{2}(\mathfrak{m})|\!<\!|\Gamma_{1}^{l}(u)\cap T_{1}(\mathfrak{m})|\] is a cutting function for SUB. \end{claim} @@ -769,9 +760,12 @@ \section{Implementation details}\label{sec:VF2ppImpl} This section provides a detailed summary of an efficient implementation of VF2++. +\begin{notation} +Let $\Delta_1$ and $\Delta_2$ denote the largest degree in $G_1$ and $G_2$, respectively, and let $\Delta=\max\{\Delta_1,\Delta_2\}$. +\end{notation} \subsection{Storing a mapping} After fixing an arbitrary node order ($u_0, u_1, .., -u_{|G_{1}|-1}$) of $G_{1}$, an array $M$ is usable to store +u_{|V_{1}|-1}$) of $G_{1}$, an array $M$ can be used to store the current mapping in the following way. \[ M[i] = @@ -780,13 +774,13 @@ if\ no\ node\ has\ been\ mapped\ to\ u_i, \end{cases} \] -where $i\in\{0,1, ..,|G_{1}|-1\}$, $v\in V_{2}$ and $INV\!ALI\!D$ +where $i\in\{0,1, ..,|V_{1}|-1\}$, $v\in V_{2}$ and $INV\!ALI\!D$ means "no node". \subsection{Avoiding the recurrence} The recursion of Algorithm~\ref{alg:VF2Pseu} can be realized as a \textit{while loop}, which has a loop counter $depth$ denoting the -all-time depth of the recursion. Fixing a matching order, let $M$ -denote the array storing the all-time mapping. Based on Claim~\ref{claim:claimCoverFromLeft}, +current depth of the recursion. Fixing a matching order, let $M$ +denote the array storing the current mapping. Observe that $M$ is $INV\!ALI\!D$ from index $depth$+1 and not $INV\!ALI\!D$ before $depth$. $M[depth]$ changes while the state is being processed, but the property is held before @@ -795,22 +789,21 @@ The necessary part of the candidate set is easily maintainable or computable by following -Section~\ref{candidateComputingVF2}. A much faster method -has been designed for biological- and sparse graphs, see the next +the steps described in Section~\ref{candidateComputingVF2}. A much faster method +has been designed for biological and sparse graphs, see the next section for details. \subsection{Calculating the candidates for a node} -Being aware of Claim~\ref{claim:claimCoverFromLeft}, the -task is not to maintain the candidate set, but to generate the +The task is not to maintain the candidate set, but to generate the candidate nodes in $G_{2}$ for a given node $u\in V_{1}$. In case of any of the three problem types and a mapping $\mathfrak{m}$, if a node $v\in V_{2}$ is a potential pair of $u\in V_{1}$, then $\forall u'\in \mathfrak{D}(\mathfrak{m}) : (u,u')\in E_{1}\Rightarrow (v,\mathfrak{m}(u'))\in E_{2}$. That is, each covered neighbour of $u$ has to be mapped to -a covered neighbour of $v$. +a covered neighbour of $v$, i.e. selecting arbitrarily a covered neighbour $u'$ of $u$, all of the admissible candidates for $u$ are among the neighbours of $\mathfrak{m}(u')$. -Having said that, an algorithm running in $\Theta(deg)$ time is +Having said that, an algorithm running in $\Theta(\Delta_2)$ time is describable if there exists a covered node in the component containing $u$, and a linear one otherwise. @@ -823,18 +816,17 @@ numbers $\{0,1,..,|K|-1\}$, where $K$ is the set of the labels. It enables $F_\mathcal{M}$ to be stored in an array. At first, the node order $\mathcal{M}=\emptyset$, so $F_\mathcal{M}[i]$ is the number of nodes -in $V_{1}$ having label $i$, which is easy to compute in -$\Theta(|V_{1}|)$ steps. +in $V_{2}$ having label $i$, which is easy to compute in +$\Theta(|V_{2}|)$ steps. Representing $\mathcal{M}\subseteq V_{1}$ as an array of -size $|V_{1}|$, both the computation of the BFS tree, and processing its levels by Algorithm~\ref{alg:VF2PPProcess1} can be done inplace by swapping nodes. +size $|V_{1}|$, both the computation of the BFS tree, and processing its levels by Algorithm~\ref{alg:VF2PPProcess1} can be done in-place by swapping nodes. \subsection{Cutting rules} In Section~\ref{VF2PPCuttingRules}, the cutting rules were described using the sets $T_{1}$, $T_{2}$, $\tilde T_{1}$ -and $\tilde T_{2}$, which are dependent on the all-time mapping -(i.e. on the all-time state). The aim is to check the labeled cutting -rules of VF2++ in $\Theta(deg)$ time. +and $\tilde T_{2}$, which are dependent on the current mapping. The aim is to check the labelled cutting +rules of VF2++ in $\Theta(\Delta)$ time. Firstly, suppose that these four sets are given in such a way, that checking whether a node is in a certain set takes constant time, @@ -842,41 +834,37 @@ initially zero integer lookup table of size $|K|$. After incrementing $L[\mathcal{L}(u')]$ for all $u'\in \Gamma_{1}(u) \cap T_{1}(\mathfrak{m})$ and decrementing $L[\mathcal{L}(v')]$ for all $v'\in\Gamma_{2} (v) \cap -T_{2}(s)$, the first part of the cutting rules is checkable in -$\Theta(deg)$ time by considering the proper signs of $L$. Setting $L$ -to zero takes $\Theta(deg)$ time again, which makes it possible to use +T_{2}(\mathfrak{m})$, the first part of the cutting rules is checkable in +$\Theta(\Delta)$ time by considering the proper signs of $L$. Setting $L$ +to zero takes $\Theta(\Delta)$ time again, which makes it possible to use the same table through the whole algorithm. The second part of the cutting rules can be verified using the same method with $\tilde T_{1}$ and $\tilde T_{2}$ instead of $T_{1}$ and -$T_{2}$. Thus, the overall complexity is $\Theta(deg)$. +$T_{2}$. Thus, the overall time complexity is $\Theta(\Delta)$. -Another integer lookup table storing the number of covered neighbours -of each node in $G_{2}$ gives all the information about the sets -$T_{2}$ and $\tilde T_{2}$, which is maintainable in -$\Theta(deg)$ time when a pair is added or substracted by incrementing +To maintain the sets $T_{1}$, $T_{2}$, $\tilde T_{1}$ +and $\tilde T_{2}$, two other integer lookup tables storing the number of covered neighbours of the nodes of the two graphs can be used. This representation allows constant-time membership checking, furthermore it is maintainable in $\Theta(\Delta)$ time whenever a node pair is added or subtracted by incrementing or decrementing the proper indices. A further improvement is that the values of $L[\mathcal{L}(u')]$ in case of checking $u$ are dependent only on -$u$, i.e. on the size of the mapping, so for each $u\in V_{1}$ an -array of pairs (label, number of such labels) can be stored to skip -the maintaining operations. Note that these arrays are at most of size -$deg$. +$u$, i.e. on the current depth of the recursion, so for each $u\in V_{1}$, an array of pairs \textit{(label, number of such labels)} can store $L$. Note that these arrays are at most of size +$\Delta_1$ if pairs with non-appearing node labels are discarded. Using similar techniques, the consistency function can be evaluated in -$\Theta(deg)$ steps, as well. +$\Theta(\Delta)$ steps, as well. \section{Experimental results}\label{sec:ExpRes} -This section compares the performance of VF2++ and VF2 Plus. According to +This section compares the performance of VF2++ and VF2~Plus. According to our experience, both algorithms run faster than VF2 with orders of magnitude, thus its inclusion was not reasonable. -The algorithms were implemented in C++ using the open source -LEMON graph and network optimization library\cite{LEMON}. The test were carried out on a linux based system with an Intel i7 X980 3.33 GHz CPU and 6 GB of RAM. +The algorithms were implemented in C++ using the open-source +LEMON graph and network optimization library~\cite{LEMON}. The tests were carried out on a Linux-based system with an Intel i7 X980 3.33 GHz CPU and 6 GB of RAM. \subsection{Biological graphs} The tests have been executed on a recent biological dataset created for the International Contest on Pattern Search in Biological -Databases\cite{Content}, which has been constructed of molecule, +Databases~\cite{Content}, which has been constructed of molecule, protein and contact map graphs extracted from the Protein Data -Bank\cite{ProteinDataBank}. +Bank~\cite{ProteinDataBank}. The molecule dataset contains small graphs with less than 100 nodes and an average degree of less than 3. The protein dataset contains @@ -884,14 +872,13 @@ contact map dataset contains graphs with 150-800 nodes and an average degree of 20. \\ -In the following, both the induced subgraph isomorphism and the graph -isomorphism will be examined. - +In the following, both the induced subgraph and the graph +isomorphism problems will be examined. This dataset provides graph pairs, between which all the induced subgraph isomorphisms have to be found. For runtime results, please see Figure~\ref{fig:bioIND}. In an other experiment, the nodes of each graph in the database had been shuffled, and an isomorphism between the shuffled and the original -graph was searched. The solution times are shown on Figure~\ref{fig:bioISO}. +graph was searched. The running times are shown on Figure~\ref{fig:bioISO}. \begin{figure}[H] @@ -900,10 +887,11 @@ \begin{subfigure}[b]{0.55\textwidth} \begin{figure}[H] \begin{tikzpicture}[trim axis left, trim axis right] -\begin{axis}[title=Molecules IND,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid +\begin{axis}[title=Molecules IND,xlabel={$|V_2|$},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \thinspace}] + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot table {Orig/Molecules.32.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table {VF2PPLabel/Molecules.32.txt}; @@ -918,10 +906,11 @@ \begin{subfigure}[b]{0.55\textwidth} \begin{figure}[H] \begin{tikzpicture}[trim axis left, trim axis right] -\begin{axis}[title=Contact maps IND,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid +\begin{axis}[title=Contact maps IND,xlabel={$|V_2|$},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \thinspace}] + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot table {Orig/ContactMaps.128.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table @@ -929,7 +918,7 @@ \end{axis} \end{tikzpicture} \caption{On contact maps, VF2++ runs almost in constant time, while VF2 - Plus has a near linear behaviour.} \label{fig:INDContact} + Plus has a near-linear behaviour.} \label{fig:INDContact} \end{figure} \end{subfigure} @@ -938,23 +927,24 @@ \begin{subfigure}[b]{0.55\textwidth} \begin{figure}[H] \begin{tikzpicture}[trim axis left, trim axis right] - \begin{axis}[title=Proteins IND,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid + \begin{axis}[title=Proteins IND,xlabel={$|V_2|$},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \thinspace}] %\addplot+[only marks] table + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot[mark=*,mark size=1.2pt,color=blue] table {Orig/Proteins.256.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table {VF2PPLabel/Proteins.256.txt}; \end{axis} \end{tikzpicture} -\caption{Both the algorithms have linear behaviour on protein +\caption{Both of the algorithms have linear behaviour on protein graphs. VF2++ is more than 10 times faster than VF2 Plus.} \label{fig:INDProt} \end{figure} \end{subfigure} \end{center} \vspace*{-0.5cm} -\caption{\normalsize{Induced subgraph isomorphism on biological graphs}}\label{fig:bioIND} +\caption{\normalsize{Induced subgraph isomorphism problem on biological graphs}}\label{fig:bioIND} \end{figure} @@ -964,35 +954,36 @@ \begin{subfigure}[b]{0.55\textwidth} \begin{figure}[H] \begin{tikzpicture}[trim axis left, trim axis right] -\begin{axis}[title=Molecules ISO,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid +\begin{axis}[title=Molecules ISO,xlabel={$|V_2|$},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \thinspace}] + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot table {Orig/moleculesIso.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table {VF2PPLabel/moleculesIso.txt}; \end{axis} \end{tikzpicture} -\caption{In the case of molecules, there is not such a significant - difference, but VF2++ seems to be faster as the number of nodes - increases.}\label{fig:ISOMolecule} +\caption{The results are close to each other on contact maps, but VF2++ seems to be slightly faster as the number of nodes increases. + }\label{fig:ISOMolecule} \end{figure} \end{subfigure} \hspace*{1.5cm} \begin{subfigure}[b]{0.55\textwidth} \begin{figure}[H] \begin{tikzpicture}[trim axis left, trim axis right] -\begin{axis}[title=Contact maps ISO,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid +\begin{axis}[title=Contact maps ISO,xlabel={$|V_2|$},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \thinspace}] + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot table {Orig/contactMapsIso.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table {VF2PPLabel/contactMapsIso.txt}; \end{axis} \end{tikzpicture} -\caption{The results are closer to each other on contact maps, but - VF2++ still performs consistently better.}\label{fig:ISOContact} +\caption{In the case of molecules, there is no significant + difference, but VF2++ performs consistently better.}\label{fig:ISOContact} \end{figure} \end{subfigure} @@ -1001,38 +992,39 @@ \begin{subfigure}[b]{0.55\textwidth} \begin{figure}[H] \begin{tikzpicture}[trim axis left, trim axis right] -\begin{axis}[title=Proteins ISO,xlabel={target size},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid +\begin{axis}[title=Proteins ISO,xlabel={$|V_2|$},ylabel={time (ms)},legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \thinspace}] + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot table {Orig/proteinsIso.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table {VF2PPLabel/proteinsIso.txt}; \end{axis} \end{tikzpicture} -\caption{On protein graphs, VF2 Plus has a super linear time +\caption{On protein graphs, VF2~Plus has a super linear time complexity, while VF2++ runs in near constant time. The difference - is about two order of magnitude on large graphs.}\label{fig:ISOProt} + is about two orders of magnitude on large graphs.}\label{fig:ISOProt} \end{figure} \end{subfigure} \end{center} \vspace*{-0.6cm} -\caption{\normalsize{Graph isomorphism on biological graphs}}\label{fig:bioISO} +\caption{\normalsize{Graph isomorphism problem on biological graphs}}\label{fig:bioISO} \end{figure} \subsection{Random graphs} -This section compares VF2++ with VF2 Plus on random graphs of a large +This section compares VF2++ with VF2~Plus on random graphs of large size. The node labels are uniformly distributed. Let $\delta$ denote the average degree. For the parameters of problems solved in the experiments, please see the top of each chart. -\subsubsection{Graph isomorphism} +\subsubsection{Graph isomorphism problem} To evaluate the efficiency of the algorithms in the case of graph -isomorphism, random connected graphs of less than 20 000 nodes have been +isomorphism problem, random connected graphs of less than 20 000 nodes have been considered. Generating a random graph and shuffling its nodes, an -isomorphism had to be found. Figure \ref{fig:randISO} shows the runtime results +isomorphism had to be found. Figure~\ref{fig:randISO} shows the runtime results on graph sets of various density. @@ -1044,10 +1036,11 @@ \begin{subfigure}[b]{0.55\textwidth} \begin{center} \begin{tikzpicture} -\begin{axis}[title={Random ISO, $\delta = 5$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid +\begin{axis}[title={Random ISO, $\delta = 5$},width=7.2cm,height=6cm,xlabel={$|V_2|$},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \space}] + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot table {randGraph/iso/vf2pIso5_1.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table @@ -1060,10 +1053,11 @@ \begin{subfigure}[b]{0.55\textwidth} \begin{center} \begin{tikzpicture} -\begin{axis}[title={Random ISO, $\delta = 10$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid +\begin{axis}[title={Random ISO, $\delta = 10$},width=7.2cm,height=6cm,xlabel={$|V_2|$},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \space}] + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot table {randGraph/iso/vf2pIso10_1.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table @@ -1077,10 +1071,11 @@ \begin{subfigure}[b]{0.55\textwidth} \begin{center} \begin{tikzpicture} -\begin{axis}[title={Random ISO, $\delta = 15$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid +\begin{axis}[title={Random ISO, $\delta = 15$},width=7.2cm,height=6cm,xlabel={$|V_2|$},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \space}] + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot table {randGraph/iso/vf2pIso15_1.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table @@ -1092,10 +1087,11 @@ \begin{subfigure}[b]{0.55\textwidth} \begin{center} \begin{tikzpicture} -\begin{axis}[title={Random ISO, $\delta = 100$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid +\begin{axis}[title={Random ISO, $\delta = 100$},width=7.2cm,height=6cm,xlabel={$|V_2|$},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \thinspace}] + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot table {randGraph/iso/vf2pIso100_1.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table @@ -1105,26 +1101,23 @@ \end{center} \end{subfigure} \vspace*{-0.8cm} -\caption{ISO on random graphs. +\caption{Graph isomorphism problem on random graphs }\label{fig:randISO} \end{figure} -\subsubsection{Induced subgraph isomorphism} -This section presents a comparison of VF2++ and VF2 Plus in the case -of induced subgraph isomorphism. In addition to the size of the large -graph, that of the small graph dramatically influences the hardness of -a given problem too, so the overall picture is provided by examining -small graphs of various size. +\subsubsection{Induced subgraph isomorphism problem} +This section presents a comparison of VF2++ and VF2~Plus in the case +of induced subgraph isomorphism problem. In addition to the size of graph $G_2$, that of $G_1$ dramatically influences the hardness of +a given problem too, so the overall picture is provided by examining graphs to be embedded of various size. For each chart, a number $0<\rho< 1$ has been fixed, and the following has been executed 150 times. Generating a large graph $G_{2}$ of an average degree of $\delta$, -choose 10 of its induced subgraphs having $\rho\ |V_{2}|$ nodes, -and for all the 10 subgraphs find a mapping by using both the graph +choose 10 of its induced subgraphs having $\rho|V_{2}|$ nodes, +and for all the 10 subgraphs find a mapping by using both graph matching algorithms. The $\delta = 5, 10, 35$ and $\rho = 0.05, 0.1, 0.3, 0.8$ cases have been examined, see -Figure~\ref{fig:randIND5}, \ref{fig:randIND10} and -\ref{fig:randIND35}. +Figure~\ref{fig:randIND5},~\ref{fig:randIND10}~and~\ref{fig:randIND35}. @@ -1136,10 +1129,11 @@ \begin{subfigure}[b]{0.55\textwidth} \begin{center} \begin{tikzpicture} -\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.05$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid +\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.05$},width=7.2cm,height=6cm,xlabel={$|V_2|$},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \space}] + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot table {randGraph/ind/vf2pInd5_0.05.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table @@ -1151,10 +1145,11 @@ \begin{subfigure}[b]{0.55\textwidth} \begin{center} \begin{tikzpicture} -\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.1$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid +\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.1$},width=7.2cm,height=6cm,xlabel={$|V_2|$},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \space}] + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot table {randGraph/ind/vf2pInd5_0.1.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table @@ -1167,10 +1162,11 @@ \begin{subfigure}[b]{0.55\textwidth} \begin{center} \begin{tikzpicture} -\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.3$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid +\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.3$},width=7.2cm,height=6cm,xlabel={$|V_2|$},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \space}] + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot table {randGraph/ind/vf2pInd5_0.3.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table @@ -1182,10 +1178,11 @@ \begin{subfigure}[b]{0.55\textwidth} \begin{center} \begin{tikzpicture} -\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.8$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid +\begin{axis}[title={Random IND, $\delta = 5$, $\rho = 0.8$},width=7.2cm,height=6cm,xlabel={$|V_2|$},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \space}] + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot table {randGraph/ind/vf2pInd5_0.8.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table @@ -1195,8 +1192,8 @@ \end{center} \end{subfigure} \vspace*{-0.8cm} -\caption{IND on graphs having an average degree of - 5.}\label{fig:randIND5} +\caption{Induced subgraph isomorphism problem on random graphs having an average degree of + 5}\label{fig:randIND5} \end{figure} @@ -1207,10 +1204,11 @@ \begin{center} \hspace*{-0.5cm} \begin{tikzpicture} -\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.05$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid +\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.05$},width=7.2cm,height=6cm,xlabel={$|V_2|$},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \space}] + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot table {randGraph/ind/vf2pInd10_0.05.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table @@ -1223,10 +1221,11 @@ \begin{center} \hspace*{-0.5cm} \begin{tikzpicture} -\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.1$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid +\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.1$},width=7.2cm,height=6cm,xlabel={$|V_2|$},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \space}] + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot table {randGraph/ind/vf2pInd10_0.1.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table @@ -1239,10 +1238,11 @@ \begin{subfigure}[b]{0.55\textwidth} \begin{center} \begin{tikzpicture} -\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.3$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid +\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.3$},width=7.2cm,height=6cm,xlabel={$|V_2|$},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \space}] + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot table {randGraph/ind/vf2pInd10_0.3.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table @@ -1254,10 +1254,11 @@ \begin{subfigure}[b]{0.55\textwidth} \begin{center} \begin{tikzpicture} -\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.8$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid +\begin{axis}[title={Random IND, $\delta = 10$, $\rho = 0.8$},width=7.2cm,height=6cm,xlabel={$|V_2|$},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \space}] + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot table {randGraph/ind/vf2pInd10_0.8.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table @@ -1267,8 +1268,8 @@ \end{center} \end{subfigure} \vspace*{-0.8cm} -\caption{IND on graphs having an average degree of - 10.}\label{fig:randIND10} +\caption{Induced subgraph isomorphism problem on random graphs having an average degree of + 10}\label{fig:randIND10} \end{figure} @@ -1279,10 +1280,11 @@ \begin{subfigure}[b]{0.55\textwidth} \begin{center} \begin{tikzpicture} -\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.05$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid +\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.05$},width=7.2cm,height=6cm,xlabel={$|V_2|$},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \space}] + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot table {randGraph/ind/vf2pInd35_0.05.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table @@ -1294,10 +1296,11 @@ \begin{subfigure}[b]{0.55\textwidth} \begin{center} \begin{tikzpicture} -\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.1$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid +\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.1$},width=7.2cm,height=6cm,xlabel={$|V_2|$},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \space}] + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot table {randGraph/ind/vf2pInd35_0.1.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table @@ -1310,10 +1313,11 @@ \begin{subfigure}[b]{0.55\textwidth} \begin{center} \begin{tikzpicture} -\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.3$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid +\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.3$},width=7.2cm,height=6cm,xlabel={$|V_2|$},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \space}] + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot table {randGraph/ind/vf2pInd35_0.3.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table @@ -1325,10 +1329,11 @@ \begin{subfigure}[b]{0.55\textwidth} \begin{center} \begin{tikzpicture} -\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.8$},width=7.2cm,height=6cm,xlabel={target size},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid +\begin{axis}[title={Random IND, $\delta = 35$, $\rho = 0.8$},width=7.2cm,height=6cm,xlabel={$|V_2|$},ylabel={time (ms)},ylabel near ticks,legend entries={VF2 Plus,VF2++},grid =major,mark size=1.2pt, legend style={at={(0,1)},anchor=north west},scaled x ticks = false,x tick label style={/pgf/number - format/1000 sep = \space}] + format/1000 sep = \kern 0.08em},y tick label style={/pgf/number + format/1000 sep = \kern 0.08em}] %\addplot+[only marks] table {proteinsOrig.txt}; \addplot table {randGraph/ind/vf2pInd35_0.8.txt}; \addplot[mark=triangle*,mark size=1.8pt,color=red] table @@ -1338,18 +1343,18 @@ \end{center} \end{subfigure} \vspace*{-0.8cm} -\caption{IND on graphs having an average degree of - 35.}\label{fig:randIND35} +\caption{Induced subgraph isomorphism problem on random graphs having an average degree of + 35}\label{fig:randIND35} \end{figure} -Based on these experiments, VF2++ is faster than VF2 Plus and able to +Based on these experiments, VF2++ is faster than VF2~Plus and able to handle really large graphs in milliseconds. Note that when $IND$ was -considered and the small graphs had proportionally few nodes ($\rho = -0.05$, or $\rho = 0.1$), then VF2 Plus produced some inefficient node +considered and the graph to be embedded had proportionally few nodes ($\rho = +0.05$, or $\rho = 0.1$), then VF2~Plus produced some inefficient node orders (e.g. see the $\delta=10$ case on Figure~\ref{fig:randIND10}). If these instances had been excluded, the -charts would have seemed to be similar to the other ones. +charts would have looked similarly to the other ones. Unsurprisingly, as denser graphs are considered, both VF2++ and VF2 Plus slow slightly down, but remain practically usable even on graphs having 10 000 nodes. @@ -1359,34 +1364,33 @@ \section{Conclusion} -This paper presented VF2++, a new graph matching algorithm based on VF2, called VF2++, and analyzed it from a practical viewpoint. +This paper presented VF2++, a new graph matching algorithm based on VF2, and analysed it from a practical viewpoint. Recognizing the importance of the node order and determining an efficient one, VF2++ is able to match graphs of thousands of nodes in near practically linear time including preprocessing. In addition to -the proper order, VF2++ uses more efficient consistency and cutting +the proper order, VF2++ uses more efficient cutting rules which are easy to compute and make the algorithm able to prune most of the unfruitful branches without going astray. In order to show the efficiency of the new method, it has been -compared to VF2 Plus\cite{VF2Plus}, which is the best contemporary algorithm. -. +compared to VF2~Plus~\cite{VF2Plus}, which is the best contemporary algorithm. -The experiments show that VF2++ consistently outperforms VF2 Plus on +The experiments show that VF2++ consistently outperforms VF2~Plus on biological graphs. It seems to be asymptotically faster on protein and -on contact map graphs in the case of induced subgraph isomorphism, -while in the case of graph isomorphism, it has definitely better +on contact map graphs in the case of induced subgraph isomorphism problem, +while in the case of graph isomorphism problem, it has definitely better asymptotic behaviour on protein graphs. Regarding random sparse graphs, not only has VF2++ proved itself to be -faster than VF2 Plus, but it also has a practically linear behaviour both -in the case of induced subgraph- and graph isomorphism. +faster than VF2~Plus, but it also has a practically linear behaviour both +in the case of induced subgraph and graph isomorphism problems. %%%%%%%%%%%%%%%% \section*{Acknowledgement} \label{sec:ack} %%%%%%%%%%%%%%%% This research project was initiated and sponsored by QuantumBio -Inc.\cite{QUANTUMBIO}. +Inc.~\cite{QUANTUMBIO}. The authors were supported by the Hungarian Scientific Research Fund - OTKA, K109240 and by the J\'anos Bolyai Research Fellowship program of