scripts/bib2dox.py
changeset 1184 3c00344f49c9
parent 754 2de0fc630899
child 1052 eb2f9d453070
equal deleted inserted replaced
3:812711987445 -1:000000000000
     1 #! /usr/bin/env python
       
     2 """
       
     3   BibTeX to Doxygen converter
       
     4   Usage: python bib2dox.py bibfile.bib > bibfile.dox
       
     5 
       
     6   This file is a part of LEMON, a generic C++ optimization library.
       
     7 
       
     8   **********************************************************************
       
     9 
       
    10   This code is the modification of the BibTeX to XML converter
       
    11   by Vidar Bronken Gundersen et al.
       
    12   See the original copyright notices below. 
       
    13 
       
    14   **********************************************************************
       
    15 
       
    16   Decoder for bibliographic data, BibTeX
       
    17   Usage: python bibtex2xml.py bibfile.bib > bibfile.xml
       
    18 
       
    19   v.8
       
    20   (c)2002-06-23 Vidar Bronken Gundersen
       
    21   http://bibtexml.sf.net/
       
    22   Reuse approved as long as this notification is kept.
       
    23   Licence: GPL.
       
    24 
       
    25   Contributions/thanks to:
       
    26   Egon Willighagen, http://sf.net/projects/jreferences/
       
    27   Richard Mahoney (for providing a test case)
       
    28 
       
    29   Edited by Sara Sprenkle to be more robust and handle more bibtex features.
       
    30   (c) 2003-01-15
       
    31 
       
    32   1.  Changed bibtex: tags to bibxml: tags.
       
    33   2.  Use xmlns:bibxml="http://bibtexml.sf.net/"
       
    34   3.  Allow spaces between @type and first {
       
    35   4.  "author" fields with multiple authors split by " and "
       
    36       are put in separate xml "bibxml:author" tags.
       
    37   5.  Option for Titles: words are capitalized
       
    38       only if first letter in title or capitalized inside braces
       
    39   6.  Removes braces from within field values
       
    40   7.  Ignores comments in bibtex file (including @comment{ or % )
       
    41   8.  Replaces some special latex tags, e.g., replaces ~ with ' '
       
    42   9.  Handles bibtex @string abbreviations
       
    43         --> includes bibtex's default abbreviations for months
       
    44         --> does concatenation of abbr # " more " and " more " # abbr
       
    45   10. Handles @type( ... ) or @type{ ... }
       
    46   11. The keywords field is split on , or ; and put into separate xml
       
    47       "bibxml:keywords" tags
       
    48   12. Ignores @preamble
       
    49 
       
    50   Known Limitations
       
    51   1.  Does not transform Latex encoding like math mode and special
       
    52       latex symbols.
       
    53   2.  Does not parse author fields into first and last names.
       
    54       E.g., It does not do anything special to an author whose name is
       
    55       in the form LAST_NAME, FIRST_NAME
       
    56       In "author" tag, will show up as
       
    57       <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author>
       
    58   3.  Does not handle "crossref" fields other than to print
       
    59       <bibxml:crossref>...</bibxml:crossref>
       
    60   4.  Does not inform user of the input's format errors.  You just won't
       
    61       be able to transform the file later with XSL
       
    62 
       
    63   You will have to manually edit the XML output if you need to handle
       
    64   these (and unknown) limitations.
       
    65 
       
    66 """
       
    67 
       
    68 import string, re
       
    69 
       
    70 # set of valid name characters
       
    71 valid_name_chars = '[\w\-:]'
       
    72 
       
    73 #
       
    74 # define global regular expression variables
       
    75 #
       
    76 author_rex = re.compile('\s+and\s+')
       
    77 rembraces_rex = re.compile('[{}]')
       
    78 capitalize_rex = re.compile('({[^}]*})')
       
    79 
       
    80 # used by bibtexkeywords(data)
       
    81 keywords_rex = re.compile('[,;]')
       
    82 
       
    83 # used by concat_line(line)
       
    84 concatsplit_rex = re.compile('\s*#\s*')
       
    85 
       
    86 # split on {, }, or " in verify_out_of_braces
       
    87 delimiter_rex = re.compile('([{}"])',re.I)
       
    88 
       
    89 field_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
       
    90 data_rex = re.compile('\s*(\w*)\s*=\s*([^,]*),?')
       
    91 
       
    92 url_rex = re.compile('\\\url\{([^}]*)\}')
       
    93 
       
    94 #
       
    95 # styles for html formatting
       
    96 #
       
    97 divstyle = 'margin-top: -4ex; margin-left: 8em;'
       
    98 
       
    99 #
       
   100 # return the string parameter without braces
       
   101 #
       
   102 def transformurls(str):
       
   103     return url_rex.sub(r'<a href="\1">\1</a>', str)
       
   104 
       
   105 #
       
   106 # return the string parameter without braces
       
   107 #
       
   108 def removebraces(str):
       
   109     return rembraces_rex.sub('', str)
       
   110 
       
   111 #
       
   112 # latex-specific replacements
       
   113 # (do this after braces were removed)
       
   114 #
       
   115 def latexreplacements(line):
       
   116     line = string.replace(line, '~', '&nbsp;')
       
   117     line = string.replace(line, '\\\'a', '&aacute;')
       
   118     line = string.replace(line, '\\"a', '&auml;')
       
   119     line = string.replace(line, '\\\'e', '&eacute;')
       
   120     line = string.replace(line, '\\"e', '&euml;')
       
   121     line = string.replace(line, '\\\'i', '&iacute;')
       
   122     line = string.replace(line, '\\"i', '&iuml;')
       
   123     line = string.replace(line, '\\\'o', '&oacute;')
       
   124     line = string.replace(line, '\\"o', '&ouml;')
       
   125     line = string.replace(line, '\\\'u', '&uacute;')
       
   126     line = string.replace(line, '\\"u', '&uuml;')
       
   127     line = string.replace(line, '\\H o', '&otilde;')
       
   128     line = string.replace(line, '\\H u', '&uuml;')   # &utilde; does not exist
       
   129     line = string.replace(line, '\\\'A', '&Aacute;')
       
   130     line = string.replace(line, '\\"A', '&Auml;')
       
   131     line = string.replace(line, '\\\'E', '&Eacute;')
       
   132     line = string.replace(line, '\\"E', '&Euml;')
       
   133     line = string.replace(line, '\\\'I', '&Iacute;')
       
   134     line = string.replace(line, '\\"I', '&Iuml;')
       
   135     line = string.replace(line, '\\\'O', '&Oacute;')
       
   136     line = string.replace(line, '\\"O', '&Ouml;')
       
   137     line = string.replace(line, '\\\'U', '&Uacute;')
       
   138     line = string.replace(line, '\\"U', '&Uuml;')
       
   139     line = string.replace(line, '\\H O', '&Otilde;')
       
   140     line = string.replace(line, '\\H U', '&Uuml;')   # &Utilde; does not exist
       
   141 
       
   142     return line
       
   143 
       
   144 #
       
   145 # copy characters form a string decoding html expressions (&xyz;)
       
   146 #
       
   147 def copychars(str, ifrom, count):
       
   148     result = ''
       
   149     i = ifrom
       
   150     c = 0
       
   151     html_spec = False
       
   152     while (i < len(str)) and (c < count):
       
   153         if str[i] == '&':
       
   154             html_spec = True;
       
   155             if i+1 < len(str):
       
   156                 result += str[i+1]
       
   157             c += 1
       
   158             i += 2
       
   159         else:
       
   160             if not html_spec:
       
   161                 if ((str[i] >= 'A') and (str[i] <= 'Z')) or \
       
   162                    ((str[i] >= 'a') and (str[i] <= 'z')):
       
   163                     result += str[i]
       
   164                     c += 1
       
   165             elif str[i] == ';':
       
   166                 html_spec = False;
       
   167             i += 1
       
   168     
       
   169     return result
       
   170 
       
   171 
       
   172 # 
       
   173 # Handle a list of authors (separated by 'and').
       
   174 # It gives back an array of the follwing values:
       
   175 #  - num: the number of authors,
       
   176 #  - list: the list of the author names,
       
   177 #  - text: the bibtex text (separated by commas and/or 'and')
       
   178 #  - abbrev: abbreviation that can be used for indicate the
       
   179 #    bibliography entries
       
   180 #
       
   181 def bibtexauthor(data):
       
   182     result = {}
       
   183     bibtex = ''
       
   184     result['list'] = author_rex.split(data)
       
   185     result['num'] = len(result['list'])
       
   186     for i, author in enumerate(result['list']):
       
   187         # general transformations
       
   188         author = latexreplacements(removebraces(author.strip()))
       
   189         # transform "Xyz, A. B." to "A. B. Xyz"
       
   190         pos = author.find(',')
       
   191         if pos != -1:
       
   192             author = author[pos+1:].strip() + ' ' + author[:pos].strip()
       
   193         result['list'][i] = author
       
   194         bibtex += author + '#'
       
   195     bibtex = bibtex[:-1]
       
   196     if result['num'] > 1:
       
   197         ix = bibtex.rfind('#')
       
   198         if result['num'] == 2:
       
   199             bibtex = bibtex[:ix] + ' and ' + bibtex[ix+1:]
       
   200         else:
       
   201             bibtex = bibtex[:ix] + ', and ' + bibtex[ix+1:]
       
   202     bibtex = bibtex.replace('#', ', ')
       
   203     result['text'] = bibtex
       
   204     
       
   205     result['abbrev'] = ''
       
   206     for author in result['list']:
       
   207         pos = author.rfind(' ') + 1
       
   208         count = 1
       
   209         if result['num'] == 1:
       
   210             count = 3
       
   211         result['abbrev'] += copychars(author, pos, count)
       
   212 
       
   213     return result
       
   214 
       
   215 
       
   216 #
       
   217 # data = title string
       
   218 # @return the capitalized title (first letter is capitalized), rest are capitalized
       
   219 # only if capitalized inside braces
       
   220 #
       
   221 def capitalizetitle(data):
       
   222     title_list = capitalize_rex.split(data)
       
   223     title = ''
       
   224     count = 0
       
   225     for phrase in title_list:
       
   226          check = string.lstrip(phrase)
       
   227 
       
   228          # keep phrase's capitalization the same
       
   229          if check.find('{') == 0:
       
   230               title += removebraces(phrase)
       
   231          else:
       
   232          # first word --> capitalize first letter (after spaces)
       
   233               if count == 0:
       
   234                   title += check.capitalize()
       
   235               else:
       
   236                   title += phrase.lower()
       
   237          count = count + 1
       
   238 
       
   239     return title
       
   240 
       
   241 
       
   242 #
       
   243 # @return the bibtex for the title
       
   244 # @param data --> title string
       
   245 # braces are removed from title
       
   246 #
       
   247 def bibtextitle(data, entrytype):
       
   248     if entrytype in ('book', 'inbook'):
       
   249         title = removebraces(data.strip())
       
   250     else:
       
   251         title = removebraces(capitalizetitle(data.strip()))
       
   252     bibtex = title
       
   253     return bibtex
       
   254 
       
   255 
       
   256 #
       
   257 # function to compare entry lists
       
   258 #
       
   259 def entry_cmp(x, y):
       
   260     return cmp(x[0], y[0])
       
   261 
       
   262 
       
   263 #
       
   264 # print the XML for the transformed "filecont_source"
       
   265 #
       
   266 def bibtexdecoder(filecont_source):
       
   267     filecont = []
       
   268     file = []
       
   269     
       
   270     # want @<alphanumeric chars><spaces>{<spaces><any chars>,
       
   271     pubtype_rex = re.compile('@(\w*)\s*{\s*(.*),')
       
   272     endtype_rex = re.compile('}\s*$')
       
   273     endtag_rex = re.compile('^\s*}\s*$')
       
   274 
       
   275     bracefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
       
   276     bracedata_rex = re.compile('\s*(\w*)\s*=\s*{(.*)},?')
       
   277 
       
   278     quotefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
       
   279     quotedata_rex = re.compile('\s*(\w*)\s*=\s*"(.*)",?')
       
   280 
       
   281     for line in filecont_source:
       
   282         line = line[:-1]
       
   283 
       
   284         # encode character entities
       
   285         line = string.replace(line, '&', '&amp;')
       
   286         line = string.replace(line, '<', '&lt;')
       
   287         line = string.replace(line, '>', '&gt;')
       
   288 
       
   289         # start entry: publication type (store for later use)
       
   290         if pubtype_rex.match(line):
       
   291         # want @<alphanumeric chars><spaces>{<spaces><any chars>,
       
   292             entrycont = {}
       
   293             entry = []
       
   294             entrytype = pubtype_rex.sub('\g<1>',line)
       
   295             entrytype = string.lower(entrytype)
       
   296             entryid   = pubtype_rex.sub('\g<2>', line)
       
   297 
       
   298         # end entry if just a }
       
   299         elif endtype_rex.match(line):
       
   300             # generate doxygen code for the entry
       
   301 
       
   302             # enty type related formattings
       
   303             if entrytype in ('book', 'inbook'):
       
   304                 entrycont['title'] = '<em>' + entrycont['title'] + '</em>'
       
   305                 if not entrycont.has_key('author'):
       
   306                     entrycont['author'] = entrycont['editor']
       
   307                     entrycont['author']['text'] += ', editors'
       
   308             elif entrytype == 'article':
       
   309                 entrycont['journal'] = '<em>' + entrycont['journal'] + '</em>'
       
   310             elif entrytype in ('inproceedings', 'incollection', 'conference'):
       
   311                 entrycont['booktitle'] = '<em>' + entrycont['booktitle'] + '</em>'
       
   312             elif entrytype == 'techreport':
       
   313                 if not entrycont.has_key('type'):
       
   314                     entrycont['type'] = 'Technical report'
       
   315             elif entrytype == 'mastersthesis':
       
   316                 entrycont['type'] = 'Master\'s thesis'
       
   317             elif entrytype == 'phdthesis':
       
   318                 entrycont['type'] = 'PhD thesis'
       
   319 
       
   320             for eline in entrycont:
       
   321                 if eline != '':
       
   322                     eline = latexreplacements(eline)
       
   323 
       
   324             if entrycont.has_key('pages') and (entrycont['pages'] != ''):
       
   325                 entrycont['pages'] = string.replace(entrycont['pages'], '--', '-')
       
   326 
       
   327             if entrycont.has_key('author') and (entrycont['author'] != ''):
       
   328                 entry.append(entrycont['author']['text'] + '.')
       
   329             if entrycont.has_key('title') and (entrycont['title'] != ''):
       
   330                 entry.append(entrycont['title'] + '.')
       
   331             if entrycont.has_key('journal') and (entrycont['journal'] != ''):
       
   332                 entry.append(entrycont['journal'] + ',')
       
   333             if entrycont.has_key('booktitle') and (entrycont['booktitle'] != ''):
       
   334                 entry.append('In ' + entrycont['booktitle'] + ',')
       
   335             if entrycont.has_key('type') and (entrycont['type'] != ''):
       
   336                 eline = entrycont['type']
       
   337                 if entrycont.has_key('number') and (entrycont['number'] != ''):
       
   338                     eline += ' ' + entrycont['number']
       
   339                 eline += ','
       
   340                 entry.append(eline)
       
   341             if entrycont.has_key('institution') and (entrycont['institution'] != ''):
       
   342                 entry.append(entrycont['institution'] + ',')
       
   343             if entrycont.has_key('publisher') and (entrycont['publisher'] != ''):
       
   344                 entry.append(entrycont['publisher'] + ',')
       
   345             if entrycont.has_key('school') and (entrycont['school'] != ''):
       
   346                 entry.append(entrycont['school'] + ',')
       
   347             if entrycont.has_key('address') and (entrycont['address'] != ''):
       
   348                 entry.append(entrycont['address'] + ',')
       
   349             if entrycont.has_key('edition') and (entrycont['edition'] != ''):
       
   350                 entry.append(entrycont['edition'] + ' edition,')
       
   351             if entrycont.has_key('howpublished') and (entrycont['howpublished'] != ''):
       
   352                 entry.append(entrycont['howpublished'] + ',')
       
   353             if entrycont.has_key('volume') and (entrycont['volume'] != ''):
       
   354                 eline = entrycont['volume'];
       
   355                 if entrycont.has_key('number') and (entrycont['number'] != ''):
       
   356                     eline += '(' + entrycont['number'] + ')'
       
   357                 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
       
   358                     eline += ':' + entrycont['pages']
       
   359                 eline += ','
       
   360                 entry.append(eline)
       
   361             else:
       
   362                 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
       
   363                     entry.append('pages ' + entrycont['pages'] + ',')
       
   364             if entrycont.has_key('year') and (entrycont['year'] != ''):
       
   365                 if entrycont.has_key('month') and (entrycont['month'] != ''):
       
   366                     entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')
       
   367                 else:
       
   368                     entry.append(entrycont['year'] + '.')
       
   369             if entrycont.has_key('note') and (entrycont['note'] != ''):
       
   370                 entry.append(entrycont['note'] + '.')
       
   371             if entrycont.has_key('url') and (entrycont['url'] != ''):
       
   372                 entry.append(entrycont['url'] + '.')
       
   373 
       
   374             # generate keys for sorting and for the output
       
   375             sortkey = ''
       
   376             bibkey = ''
       
   377             if entrycont.has_key('author'):
       
   378                 for author in entrycont['author']['list']:
       
   379                     sortkey += copychars(author, author.rfind(' ')+1, len(author))
       
   380                 bibkey = entrycont['author']['abbrev']
       
   381             else:
       
   382                 bibkey = 'x'
       
   383             if entrycont.has_key('year'):
       
   384                 sortkey += entrycont['year']
       
   385                 bibkey += entrycont['year'][-2:]
       
   386             if entrycont.has_key('title'):
       
   387                 sortkey += entrycont['title']
       
   388             if entrycont.has_key('key'):
       
   389                 sortkey = entrycont['key'] + sortkey
       
   390                 bibkey = entrycont['key']
       
   391             entry.insert(0, sortkey)
       
   392             entry.insert(1, bibkey)
       
   393             entry.insert(2, entryid)
       
   394            
       
   395             # add the entry to the file contents
       
   396             filecont.append(entry)
       
   397 
       
   398         else:
       
   399             # field, publication info
       
   400             field = ''
       
   401             data = ''
       
   402             
       
   403             # field = {data} entries
       
   404             if bracedata_rex.match(line):
       
   405                 field = bracefield_rex.sub('\g<1>', line)
       
   406                 field = string.lower(field)
       
   407                 data =  bracedata_rex.sub('\g<2>', line)
       
   408 
       
   409             # field = "data" entries
       
   410             elif quotedata_rex.match(line):
       
   411                 field = quotefield_rex.sub('\g<1>', line)
       
   412                 field = string.lower(field)
       
   413                 data =  quotedata_rex.sub('\g<2>', line)
       
   414 
       
   415             # field = data entries
       
   416             elif data_rex.match(line):
       
   417                 field = field_rex.sub('\g<1>', line)
       
   418                 field = string.lower(field)
       
   419                 data =  data_rex.sub('\g<2>', line)
       
   420 
       
   421             if field == 'url':
       
   422                 data = '\\url{' + data.strip() + '}'
       
   423             
       
   424             if field in ('author', 'editor'):
       
   425                 entrycont[field] = bibtexauthor(data)
       
   426                 line = ''
       
   427             elif field == 'title':
       
   428                 line = bibtextitle(data, entrytype)
       
   429             elif field != '':
       
   430                 line = removebraces(transformurls(data.strip()))
       
   431 
       
   432             if line != '':
       
   433                 line = latexreplacements(line)
       
   434                 entrycont[field] = line
       
   435 
       
   436 
       
   437     # sort entries
       
   438     filecont.sort(entry_cmp)
       
   439     
       
   440     # count the bibtex keys
       
   441     keytable = {}
       
   442     counttable = {}
       
   443     for entry in filecont:
       
   444         bibkey = entry[1]
       
   445         if not keytable.has_key(bibkey):
       
   446             keytable[bibkey] = 1
       
   447         else:
       
   448             keytable[bibkey] += 1
       
   449 
       
   450     for bibkey in keytable.keys():
       
   451         counttable[bibkey] = 0
       
   452     
       
   453     # generate output
       
   454     for entry in filecont:
       
   455         # generate output key form the bibtex key
       
   456         bibkey = entry[1]
       
   457         entryid = entry[2]
       
   458         if keytable[bibkey] == 1:
       
   459             outkey = bibkey
       
   460         else:
       
   461             outkey = bibkey + chr(97 + counttable[bibkey])
       
   462         counttable[bibkey] += 1
       
   463         
       
   464         # append the entry code to the output
       
   465         file.append('\\section ' + entryid + ' [' + outkey + ']')
       
   466         file.append('<div style="' + divstyle + '">')
       
   467         for line in entry[3:]:
       
   468             file.append(line)
       
   469         file.append('</div>')
       
   470         file.append('')
       
   471 
       
   472     return file
       
   473 
       
   474 
       
   475 #
       
   476 # return 1 iff abbr is in line but not inside braces or quotes
       
   477 # assumes that abbr appears only once on the line (out of braces and quotes)
       
   478 #
       
   479 def verify_out_of_braces(line, abbr):
       
   480 
       
   481     phrase_split = delimiter_rex.split(line)
       
   482 
       
   483     abbr_rex = re.compile( '\\b' + abbr + '\\b', re.I)
       
   484 
       
   485     open_brace = 0
       
   486     open_quote = 0
       
   487 
       
   488     for phrase in phrase_split:
       
   489         if phrase == "{":
       
   490             open_brace = open_brace + 1
       
   491         elif phrase == "}":
       
   492             open_brace = open_brace - 1
       
   493         elif phrase == '"':
       
   494             if open_quote == 1:
       
   495                 open_quote = 0
       
   496             else:
       
   497                 open_quote = 1
       
   498         elif abbr_rex.search(phrase):
       
   499             if open_brace == 0 and open_quote == 0:
       
   500                 return 1
       
   501 
       
   502     return 0
       
   503 
       
   504 
       
   505 #
       
   506 # a line in the form phrase1 # phrase2 # ... # phrasen
       
   507 # is returned as phrase1 phrase2 ... phrasen
       
   508 # with the correct punctuation
       
   509 # Bug: Doesn't always work with multiple abbreviations plugged in
       
   510 #
       
   511 def concat_line(line):
       
   512     # only look at part after equals
       
   513     field = field_rex.sub('\g<1>',line)
       
   514     rest = field_rex.sub('\g<2>',line)
       
   515 
       
   516     concat_line = field + ' ='
       
   517 
       
   518     pound_split = concatsplit_rex.split(rest)
       
   519 
       
   520     phrase_count = 0
       
   521     length = len(pound_split)
       
   522 
       
   523     for phrase in pound_split:
       
   524         phrase = phrase.strip()
       
   525         if phrase_count != 0:
       
   526             if phrase.startswith('"') or phrase.startswith('{'):
       
   527                 phrase = phrase[1:]
       
   528         elif phrase.startswith('"'):
       
   529             phrase = phrase.replace('"','{',1)
       
   530 
       
   531         if phrase_count != length-1:
       
   532             if phrase.endswith('"') or phrase.endswith('}'):
       
   533                 phrase = phrase[:-1]
       
   534         else:
       
   535             if phrase.endswith('"'):
       
   536                 phrase = phrase[:-1]
       
   537                 phrase = phrase + "}"
       
   538             elif phrase.endswith('",'):
       
   539                 phrase = phrase[:-2]
       
   540                 phrase = phrase + "},"
       
   541 
       
   542         # if phrase did have \#, add the \# back
       
   543         if phrase.endswith('\\'):
       
   544             phrase = phrase + "#"
       
   545         concat_line = concat_line + ' ' + phrase
       
   546 
       
   547         phrase_count = phrase_count + 1
       
   548 
       
   549     return concat_line
       
   550 
       
   551 
       
   552 #
       
   553 # substitute abbreviations into filecont
       
   554 # @param filecont_source - string of data from file
       
   555 #
       
   556 def bibtex_replace_abbreviations(filecont_source):
       
   557     filecont = filecont_source.splitlines()
       
   558 
       
   559     #  These are defined in bibtex, so we'll define them too
       
   560     abbr_list = ['jan','feb','mar','apr','may','jun',
       
   561                  'jul','aug','sep','oct','nov','dec']
       
   562     value_list = ['January','February','March','April',
       
   563                   'May','June','July','August','September',
       
   564                   'October','November','December']
       
   565 
       
   566     abbr_rex = []
       
   567     total_abbr_count = 0
       
   568 
       
   569     front = '\\b'
       
   570     back = '(,?)\\b'
       
   571 
       
   572     for x in abbr_list:
       
   573         abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
       
   574         total_abbr_count = total_abbr_count + 1
       
   575 
       
   576 
       
   577     abbrdef_rex = re.compile('\s*@string\s*{\s*('+ valid_name_chars +'*)\s*=(.*)',
       
   578                              re.I)
       
   579 
       
   580     comment_rex = re.compile('@comment\s*{',re.I)
       
   581     preamble_rex = re.compile('@preamble\s*{',re.I)
       
   582 
       
   583     waiting_for_end_string = 0
       
   584     i = 0
       
   585     filecont2 = ''
       
   586 
       
   587     for line in filecont:
       
   588         if line == ' ' or line == '':
       
   589             continue
       
   590 
       
   591         if waiting_for_end_string:
       
   592             if re.search('}',line):
       
   593                 waiting_for_end_string = 0
       
   594                 continue
       
   595 
       
   596         if abbrdef_rex.search(line):
       
   597             abbr = abbrdef_rex.sub('\g<1>', line)
       
   598 
       
   599             if abbr_list.count(abbr) == 0:
       
   600                 val = abbrdef_rex.sub('\g<2>', line)
       
   601                 abbr_list.append(abbr)
       
   602                 value_list.append(string.strip(val))
       
   603                 abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
       
   604                 total_abbr_count = total_abbr_count + 1
       
   605             waiting_for_end_string = 1
       
   606             continue
       
   607 
       
   608         if comment_rex.search(line):
       
   609             waiting_for_end_string = 1
       
   610             continue
       
   611 
       
   612         if preamble_rex.search(line):
       
   613             waiting_for_end_string = 1
       
   614             continue
       
   615 
       
   616 
       
   617         # replace subsequent abbreviations with the value
       
   618         abbr_count = 0
       
   619 
       
   620         for x in abbr_list:
       
   621 
       
   622             if abbr_rex[abbr_count].search(line):
       
   623                 if verify_out_of_braces(line,abbr_list[abbr_count]) == 1:
       
   624                     line = abbr_rex[abbr_count].sub( value_list[abbr_count] + '\g<1>', line)
       
   625                 # Check for # concatenations
       
   626                 if concatsplit_rex.search(line):
       
   627                     line = concat_line(line)
       
   628             abbr_count = abbr_count + 1
       
   629 
       
   630 
       
   631         filecont2 = filecont2 + line + '\n'
       
   632         i = i+1
       
   633 
       
   634 
       
   635     # Do one final pass over file
       
   636 
       
   637     # make sure that didn't end up with {" or }" after the substitution
       
   638     filecont2 = filecont2.replace('{"','{{')
       
   639     filecont2 = filecont2.replace('"}','}}')
       
   640 
       
   641     afterquotevalue_rex = re.compile('"\s*,\s*')
       
   642     afterbrace_rex = re.compile('"\s*}')
       
   643     afterbracevalue_rex = re.compile('(=\s*{[^=]*)},\s*')
       
   644 
       
   645     # add new lines to data that changed because of abbreviation substitutions
       
   646     filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
       
   647     filecont2 = afterbrace_rex.sub('"\n}', filecont2)
       
   648     filecont2 = afterbracevalue_rex.sub('\g<1>},\n', filecont2)
       
   649 
       
   650     return filecont2
       
   651 
       
   652 #
       
   653 # convert @type( ... ) to @type{ ... }
       
   654 #
       
   655 def no_outer_parens(filecont):
       
   656 
       
   657     # do checking for open parens
       
   658     # will convert to braces
       
   659     paren_split = re.split('([(){}])',filecont)
       
   660 
       
   661     open_paren_count = 0
       
   662     open_type = 0
       
   663     look_next = 0
       
   664 
       
   665     # rebuild filecont
       
   666     filecont = ''
       
   667 
       
   668     at_rex = re.compile('@\w*')
       
   669 
       
   670     for phrase in paren_split:
       
   671         if look_next == 1:
       
   672             if phrase == '(':
       
   673                 phrase = '{'
       
   674                 open_paren_count = open_paren_count + 1
       
   675             else:
       
   676                 open_type = 0
       
   677             look_next = 0
       
   678 
       
   679         if phrase == '(':
       
   680             open_paren_count = open_paren_count + 1
       
   681 
       
   682         elif phrase == ')':
       
   683             open_paren_count = open_paren_count - 1
       
   684             if open_type == 1 and open_paren_count == 0:
       
   685                 phrase = '}'
       
   686                 open_type = 0
       
   687 
       
   688         elif at_rex.search( phrase ):
       
   689             open_type = 1
       
   690             look_next = 1
       
   691 
       
   692         filecont = filecont + phrase
       
   693 
       
   694     return filecont
       
   695 
       
   696 
       
   697 #
       
   698 # make all whitespace into just one space
       
   699 # format the bibtex file into a usable form.
       
   700 #
       
   701 def bibtexwasher(filecont_source):
       
   702 
       
   703     space_rex = re.compile('\s+')
       
   704     comment_rex = re.compile('\s*%')
       
   705 
       
   706     filecont = []
       
   707 
       
   708     # remove trailing and excessive whitespace
       
   709     # ignore comments
       
   710     for line in filecont_source:
       
   711         line = string.strip(line)
       
   712         line = space_rex.sub(' ', line)
       
   713         # ignore comments
       
   714         if not comment_rex.match(line) and line != '':
       
   715             filecont.append(' '+ line)
       
   716 
       
   717     filecont = string.join(filecont, '')
       
   718 
       
   719     # the file is in one long string
       
   720 
       
   721     filecont = no_outer_parens(filecont)
       
   722 
       
   723     #
       
   724     # split lines according to preferred syntax scheme
       
   725     #
       
   726     filecont = re.sub('(=\s*{[^=]*)},', '\g<1>},\n', filecont)
       
   727 
       
   728     # add new lines after commas that are after values
       
   729     filecont = re.sub('"\s*,', '",\n', filecont)
       
   730     filecont = re.sub('=\s*([\w\d]+)\s*,', '= \g<1>,\n', filecont)
       
   731     filecont = re.sub('(@\w*)\s*({(\s*)[^,\s]*)\s*,',
       
   732                           '\n\n\g<1>\g<2>,\n', filecont)
       
   733 
       
   734     # add new lines after }
       
   735     filecont = re.sub('"\s*}','"\n}\n', filecont)
       
   736     filecont = re.sub('}\s*,','},\n', filecont)
       
   737 
       
   738 
       
   739     filecont = re.sub('@(\w*)', '\n@\g<1>', filecont)
       
   740 
       
   741     # character encoding, reserved latex characters
       
   742     filecont = re.sub('{\\\&}', '&', filecont)
       
   743     filecont = re.sub('\\\&', '&', filecont)
       
   744 
       
   745     # do checking for open braces to get format correct
       
   746     open_brace_count = 0
       
   747     brace_split = re.split('([{}])',filecont)
       
   748 
       
   749     # rebuild filecont
       
   750     filecont = ''
       
   751 
       
   752     for phrase in brace_split:
       
   753         if phrase == '{':
       
   754             open_brace_count = open_brace_count + 1
       
   755         elif phrase == '}':
       
   756             open_brace_count = open_brace_count - 1
       
   757             if open_brace_count == 0:
       
   758                 filecont = filecont + '\n'
       
   759 
       
   760         filecont = filecont + phrase
       
   761 
       
   762     filecont2 = bibtex_replace_abbreviations(filecont)
       
   763 
       
   764     # gather
       
   765     filecont = filecont2.splitlines()
       
   766     i=0
       
   767     j=0         # count the number of blank lines
       
   768     for line in filecont:
       
   769         # ignore blank lines
       
   770         if line == '' or line == ' ':
       
   771             j = j+1
       
   772             continue
       
   773         filecont[i] = line + '\n'
       
   774         i = i+1
       
   775 
       
   776     # get rid of the extra stuff at the end of the array
       
   777     # (The extra stuff are duplicates that are in the array because
       
   778     # blank lines were removed.)
       
   779     length = len( filecont)
       
   780     filecont[length-j:length] = []
       
   781 
       
   782     return filecont
       
   783 
       
   784 
       
   785 def filehandler(filepath):
       
   786     try:
       
   787         fd = open(filepath, 'r')
       
   788         filecont_source = fd.readlines()
       
   789         fd.close()
       
   790     except:
       
   791         print 'Could not open file:', filepath
       
   792     washeddata = bibtexwasher(filecont_source)
       
   793     outdata = bibtexdecoder(washeddata)
       
   794     print '/**'
       
   795     print '\page references References'
       
   796     print
       
   797     for line in outdata:
       
   798         print line
       
   799     print '*/'
       
   800 
       
   801 
       
   802 # main program
       
   803 
       
   804 def main():
       
   805     import sys
       
   806     if sys.argv[1:]:
       
   807         filepath = sys.argv[1]
       
   808     else:
       
   809         print "No input file"
       
   810         sys.exit()
       
   811     filehandler(filepath)
       
   812 
       
   813 if __name__ == "__main__": main()
       
   814 
       
   815 
       
   816 # end python script