scripts/bib2dox.py
author Alpar Juttner <alpar@cs.elte.hu>
Mon, 05 Oct 2009 20:21:54 +0200
changeset 797 b7e3662faf02
parent 790 94ef0a5c0005
child 801 2de0fc630899
permissions -rw-r--r--
Merge #317
     1 #!/usr/bin/env /usr/local/Python/bin/python2.1
     2 """
     3   BibTeX to Doxygen converter
     4   Usage: python bib2dox.py bibfile.bib > bibfile.dox
     5 
     6   This code is a modification of the BibTeX to XML converter
     7   by Vidar Bronken Gundersen et al. See the original copyright notices below.
     8 
     9   **********************************************************************
    10 
    11   Decoder for bibliographic data, BibTeX
    12   Usage: python bibtex2xml.py bibfile.bib > bibfile.xml
    13 
    14   v.8
    15   (c)2002-06-23 Vidar Bronken Gundersen
    16   http://bibtexml.sf.net/
    17   Reuse approved as long as this notification is kept.
    18   Licence: GPL.
    19 
    20   Contributions/thanks to:
    21   Egon Willighagen, http://sf.net/projects/jreferences/
    22   Richard Mahoney (for providing a test case)
    23 
    24   Edited by Sara Sprenkle to be more robust and handle more bibtex features.
    25   (c) 2003-01-15
    26 
    27   1.  Changed bibtex: tags to bibxml: tags.
    28   2.  Use xmlns:bibxml="http://bibtexml.sf.net/"
    29   3.  Allow spaces between @type and first {
    30   4.  "author" fields with multiple authors split by " and "
    31       are put in separate xml "bibxml:author" tags.
    32   5.  Option for Titles: words are capitalized
    33       only if first letter in title or capitalized inside braces
    34   6.  Removes braces from within field values
    35   7.  Ignores comments in bibtex file (including @comment{ or % )
    36   8.  Replaces some special latex tags, e.g., replaces ~ with '&#160;'
    37   9.  Handles bibtex @string abbreviations
    38         --> includes bibtex's default abbreviations for months
    39         --> does concatenation of abbr # " more " and " more " # abbr
    40   10. Handles @type( ... ) or @type{ ... }
    41   11. The keywords field is split on , or ; and put into separate xml
    42       "bibxml:keywords" tags
    43   12. Ignores @preamble
    44 
    45   Known Limitations
    46   1.  Does not transform Latex encoding like math mode and special
    47       latex symbols.
    48   2.  Does not parse author fields into first and last names.
    49       E.g., It does not do anything special to an author whose name is
    50       in the form LAST_NAME, FIRST_NAME
    51       In "author" tag, will show up as
    52       <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author>
    53   3.  Does not handle "crossref" fields other than to print
    54       <bibxml:crossref>...</bibxml:crossref>
    55   4.  Does not inform user of the input's format errors.  You just won't
    56       be able to transform the file later with XSL
    57 
    58   You will have to manually edit the XML output if you need to handle
    59   these (and unknown) limitations.
    60 
    61 """
    62 
    63 import string, re
    64 
    65 # set of valid name characters
    66 valid_name_chars = '[\w\-:]'
    67 
    68 #
    69 # define global regular expression variables
    70 #
    71 author_rex = re.compile('\s+and\s+')
    72 rembraces_rex = re.compile('[{}]')
    73 capitalize_rex = re.compile('({\w*})')
    74 
    75 # used by bibtexkeywords(data)
    76 keywords_rex = re.compile('[,;]')
    77 
    78 # used by concat_line(line)
    79 concatsplit_rex = re.compile('\s*#\s*')
    80 
    81 # split on {, }, or " in verify_out_of_braces
    82 delimiter_rex = re.compile('([{}"])',re.I)
    83 
    84 field_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
    85 data_rex = re.compile('\s*(\w*)\s*=\s*([^,]*),?')
    86 
    87 url_rex = re.compile('\\\url\{([^}]*)\}')
    88 
    89 #
    90 # styles for html formatting
    91 #
    92 divstyle = 'margin-top: -4ex; margin-left: 8em;'
    93 
    94 #
    95 # return the string parameter without braces
    96 #
    97 def transformurls(str):
    98     return url_rex.sub(r'<a href="\1">\1</a>', str)
    99 
   100 #
   101 # return the string parameter without braces
   102 #
   103 def removebraces(str):
   104     return rembraces_rex.sub('', str)
   105 
   106 #
   107 # latex-specific replacements
   108 # (do this after braces were removed)
   109 #
   110 def latexreplacements(line):
   111     line = string.replace(line, '~', '&nbsp;')
   112     line = string.replace(line, '\\\'a', '&aacute;')
   113     line = string.replace(line, '\\"a', '&auml;')
   114     line = string.replace(line, '\\\'e', '&eacute;')
   115     line = string.replace(line, '\\"e', '&euml;')
   116     line = string.replace(line, '\\\'i', '&iacute;')
   117     line = string.replace(line, '\\"i', '&iuml;')
   118     line = string.replace(line, '\\\'o', '&oacute;')
   119     line = string.replace(line, '\\"o', '&ouml;')
   120     line = string.replace(line, '\\\'u', '&uacute;')
   121     line = string.replace(line, '\\"u', '&uuml;')
   122     line = string.replace(line, '\\H o', '&otilde;')
   123     line = string.replace(line, '\\H u', '&uuml;')   # &utilde; does not exist
   124     line = string.replace(line, '\\\'A', '&Aacute;')
   125     line = string.replace(line, '\\"A', '&Auml;')
   126     line = string.replace(line, '\\\'E', '&Eacute;')
   127     line = string.replace(line, '\\"E', '&Euml;')
   128     line = string.replace(line, '\\\'I', '&Iacute;')
   129     line = string.replace(line, '\\"I', '&Iuml;')
   130     line = string.replace(line, '\\\'O', '&Oacute;')
   131     line = string.replace(line, '\\"O', '&Ouml;')
   132     line = string.replace(line, '\\\'U', '&Uacute;')
   133     line = string.replace(line, '\\"U', '&Uuml;')
   134     line = string.replace(line, '\\H O', '&Otilde;')
   135     line = string.replace(line, '\\H U', '&Uuml;')   # &Utilde; does not exist
   136 
   137     return line
   138 
   139 #
   140 # copy characters form a string decoding html expressions (&xyz;)
   141 #
   142 def copychars(str, ifrom, count):
   143     result = ''
   144     i = ifrom
   145     c = 0
   146     html_spec = False
   147     while (i < len(str)) and (c < count):
   148         if str[i] == '&':
   149             html_spec = True;
   150             if i+1 < len(str):
   151                 result += str[i+1]
   152             c += 1
   153             i += 2
   154         else:
   155             if not html_spec:
   156                 if ((str[i] >= 'A') and (str[i] <= 'Z')) or \
   157                    ((str[i] >= 'a') and (str[i] <= 'z')):
   158                     result += str[i]
   159                     c += 1
   160             elif str[i] == ';':
   161                 html_spec = False;
   162             i += 1
   163     
   164     return result
   165 
   166 
   167 # 
   168 # Handle a list of authors (separated by 'and').
   169 # It gives back an array of the follwing values:
   170 #  - num: the number of authors,
   171 #  - list: the list of the author names,
   172 #  - text: the bibtex text (separated by commas and/or 'and')
   173 #  - abbrev: abbreviation that can be used for indicate the
   174 #    bibliography entries
   175 #
   176 def bibtexauthor(data):
   177     result = {}
   178     bibtex = ''
   179     result['list'] = author_rex.split(data)
   180     result['num'] = len(result['list'])
   181     for i, author in enumerate(result['list']):
   182         # general transformations
   183         author = latexreplacements(removebraces(author.strip()))
   184         # transform "Xyz, A. B." to "A. B. Xyz"
   185         pos = author.find(',')
   186         if pos != -1:
   187             author = author[pos+1:].strip() + ' ' + author[:pos].strip()
   188         result['list'][i] = author
   189         bibtex += author + '#'
   190     bibtex = bibtex[:-1]
   191     if result['num'] > 1:
   192         ix = bibtex.rfind('#')
   193         if result['num'] == 2:
   194             bibtex = bibtex[:ix] + ' and ' + bibtex[ix+1:]
   195         else:
   196             bibtex = bibtex[:ix] + ', and ' + bibtex[ix+1:]
   197     bibtex = bibtex.replace('#', ', ')
   198     result['text'] = bibtex
   199     
   200     result['abbrev'] = ''
   201     for author in result['list']:
   202         pos = author.rfind(' ') + 1
   203         count = 1
   204         if result['num'] == 1:
   205             count = 3
   206         result['abbrev'] += copychars(author, pos, count)
   207 
   208     return result
   209 
   210 
   211 #
   212 # data = title string
   213 # @return the capitalized title (first letter is capitalized), rest are capitalized
   214 # only if capitalized inside braces
   215 #
   216 def capitalizetitle(data):
   217     title_list = capitalize_rex.split(data)
   218     title = ''
   219     count = 0
   220     for phrase in title_list:
   221          check = string.lstrip(phrase)
   222 
   223          # keep phrase's capitalization the same
   224          if check.find('{') == 0:
   225               title += removebraces(phrase)
   226          else:
   227          # first word --> capitalize first letter (after spaces)
   228               if count == 0:
   229                   title += check.capitalize()
   230               else:
   231                   title += phrase.lower()
   232          count = count + 1
   233 
   234     return title
   235 
   236 
   237 #
   238 # @return the bibtex for the title
   239 # @param data --> title string
   240 # braces are removed from title
   241 #
   242 def bibtextitle(data, entrytype):
   243     if entrytype in ('book', 'inbook'):
   244         title = removebraces(data.strip())
   245     else:
   246         title = removebraces(capitalizetitle(data.strip()))
   247     bibtex = title
   248     return bibtex
   249 
   250 
   251 #
   252 # function to compare entry lists
   253 #
   254 def entry_cmp(x, y):
   255     return cmp(x[0], y[0])
   256 
   257 
   258 #
   259 # print the XML for the transformed "filecont_source"
   260 #
   261 def bibtexdecoder(filecont_source):
   262     filecont = []
   263     file = []
   264     
   265     # want @<alphanumeric chars><spaces>{<spaces><any chars>,
   266     pubtype_rex = re.compile('@(\w*)\s*{\s*(.*),')
   267     endtype_rex = re.compile('}\s*$')
   268     endtag_rex = re.compile('^\s*}\s*$')
   269 
   270     bracefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
   271     bracedata_rex = re.compile('\s*(\w*)\s*=\s*{(.*)},?')
   272 
   273     quotefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
   274     quotedata_rex = re.compile('\s*(\w*)\s*=\s*"(.*)",?')
   275 
   276     for line in filecont_source:
   277         line = line[:-1]
   278 
   279         # encode character entities
   280         line = string.replace(line, '&', '&amp;')
   281         line = string.replace(line, '<', '&lt;')
   282         line = string.replace(line, '>', '&gt;')
   283 
   284         # start entry: publication type (store for later use)
   285         if pubtype_rex.match(line):
   286         # want @<alphanumeric chars><spaces>{<spaces><any chars>,
   287             entrycont = {}
   288             entry = []
   289             entrytype = pubtype_rex.sub('\g<1>',line)
   290             entrytype = string.lower(entrytype)
   291             entryid   = pubtype_rex.sub('\g<2>', line)
   292 
   293         # end entry if just a }
   294         elif endtype_rex.match(line):
   295             # generate doxygen code for the entry
   296 
   297             # enty type related formattings
   298             if entrytype in ('book', 'inbook'):
   299                 entrycont['title'] = '<em>' + entrycont['title'] + '</em>'
   300                 if not entrycont.has_key('author'):
   301                     entrycont['author'] = entrycont['editor']
   302                     entrycont['author']['text'] += ', editors'
   303             elif entrytype == 'article':
   304                 entrycont['journal'] = '<em>' + entrycont['journal'] + '</em>'
   305             elif entrytype in ('inproceedings', 'incollection', 'conference'):
   306                 entrycont['booktitle'] = '<em>' + entrycont['booktitle'] + '</em>'
   307             elif entrytype == 'techreport':
   308                 if not entrycont.has_key('type'):
   309                     entrycont['type'] = 'Technical report'
   310             elif entrytype == 'mastersthesis':
   311                 entrycont['type'] = 'Master\'s thesis'
   312             elif entrytype == 'phdthesis':
   313                 entrycont['type'] = 'PhD thesis'
   314 
   315             for eline in entrycont:
   316                 if eline != '':
   317                     eline = latexreplacements(eline)
   318 
   319             if entrycont.has_key('pages') and (entrycont['pages'] != ''):
   320                 entrycont['pages'] = string.replace(entrycont['pages'], '--', '-')
   321 
   322             if entrycont.has_key('author') and (entrycont['author'] != ''):
   323                 entry.append(entrycont['author']['text'] + '.')
   324             if entrycont.has_key('title') and (entrycont['title'] != ''):
   325                 entry.append(entrycont['title'] + '.')
   326             if entrycont.has_key('journal') and (entrycont['journal'] != ''):
   327                 entry.append(entrycont['journal'] + ',')
   328             if entrycont.has_key('booktitle') and (entrycont['booktitle'] != ''):
   329                 entry.append('In ' + entrycont['booktitle'] + ',')
   330             if entrycont.has_key('type') and (entrycont['type'] != ''):
   331                 eline = entrycont['type']
   332                 if entrycont.has_key('number') and (entrycont['number'] != ''):
   333                     eline += ' ' + entrycont['number']
   334                 eline += ','
   335                 entry.append(eline)
   336             if entrycont.has_key('institution') and (entrycont['institution'] != ''):
   337                 entry.append(entrycont['institution'] + ',')
   338             if entrycont.has_key('publisher') and (entrycont['publisher'] != ''):
   339                 entry.append(entrycont['publisher'] + ',')
   340             if entrycont.has_key('school') and (entrycont['school'] != ''):
   341                 entry.append(entrycont['school'] + ',')
   342             if entrycont.has_key('address') and (entrycont['address'] != ''):
   343                 entry.append(entrycont['address'] + ',')
   344             if entrycont.has_key('edition') and (entrycont['edition'] != ''):
   345                 entry.append(entrycont['edition'] + ' edition,')
   346             if entrycont.has_key('howpublished') and (entrycont['howpublished'] != ''):
   347                 entry.append(entrycont['howpublished'] + ',')
   348             if entrycont.has_key('volume') and (entrycont['volume'] != ''):
   349                 eline = entrycont['volume'];
   350                 if entrycont.has_key('number') and (entrycont['number'] != ''):
   351                     eline += '(' + entrycont['number'] + ')'
   352                 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
   353                     eline += ':' + entrycont['pages']
   354                 eline += ','
   355                 entry.append(eline)
   356             else:
   357                 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
   358                     entry.append('pages ' + entrycont['pages'] + ',')
   359             if entrycont.has_key('year') and (entrycont['year'] != ''):
   360                 if entrycont.has_key('month') and (entrycont['month'] != ''):
   361                     entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')
   362                 else:
   363                     entry.append(entrycont['year'] + '.')
   364             if entrycont.has_key('note') and (entrycont['note'] != ''):
   365                 entry.append(entrycont['note'] + '.')
   366 
   367             # generate keys for sorting and for the output
   368             sortkey = ''
   369             bibkey = ''
   370             if entrycont.has_key('author'):
   371                 for author in entrycont['author']['list']:
   372                     sortkey += copychars(author, author.rfind(' ')+1, len(author))
   373                 bibkey = entrycont['author']['abbrev']
   374             else:
   375                 bibkey = 'x'
   376             if entrycont.has_key('year'):
   377                 sortkey += entrycont['year']
   378                 bibkey += entrycont['year'][-2:]
   379             if entrycont.has_key('title'):
   380                 sortkey += entrycont['title']
   381             if entrycont.has_key('key'):
   382                 sortkey = entrycont['key'] + sortkey
   383                 bibkey = entrycont['key']
   384             entry.insert(0, sortkey)
   385             entry.insert(1, bibkey)
   386             entry.insert(2, entryid)
   387            
   388             # add the entry to the file contents
   389             filecont.append(entry)
   390 
   391         else:
   392             # field, publication info
   393             field = ''
   394             data = ''
   395             
   396             # field = {data} entries
   397             if bracedata_rex.match(line):
   398                 field = bracefield_rex.sub('\g<1>', line)
   399                 field = string.lower(field)
   400                 data =  bracedata_rex.sub('\g<2>', line)
   401 
   402             # field = "data" entries
   403             elif quotedata_rex.match(line):
   404                 field = quotefield_rex.sub('\g<1>', line)
   405                 field = string.lower(field)
   406                 data =  quotedata_rex.sub('\g<2>', line)
   407 
   408             # field = data entries
   409             elif data_rex.match(line):
   410                 field = field_rex.sub('\g<1>', line)
   411                 field = string.lower(field)
   412                 data =  data_rex.sub('\g<2>', line)
   413             
   414             if field in ('author', 'editor'):
   415                 entrycont[field] = bibtexauthor(data)
   416                 line = ''
   417             elif field == 'title':
   418                 line = bibtextitle(data, entrytype)
   419             elif field != '':
   420                 line = removebraces(transformurls(data.strip()))
   421 
   422             if line != '':
   423                 line = latexreplacements(line)
   424                 entrycont[field] = line
   425 
   426 
   427     # sort entries
   428     filecont.sort(entry_cmp)
   429     
   430     # count the bibtex keys
   431     keytable = {}
   432     counttable = {}
   433     for entry in filecont:
   434         bibkey = entry[1]
   435         if not keytable.has_key(bibkey):
   436             keytable[bibkey] = 1
   437         else:
   438             keytable[bibkey] += 1
   439 
   440     for bibkey in keytable.keys():
   441         counttable[bibkey] = 0
   442     
   443     # generate output
   444     for entry in filecont:
   445         # generate output key form the bibtex key
   446         bibkey = entry[1]
   447         entryid = entry[2]
   448         if keytable[bibkey] == 1:
   449             outkey = bibkey
   450         else:
   451             outkey = bibkey + chr(97 + counttable[bibkey])
   452         counttable[bibkey] += 1
   453         
   454         # append the entry code to the output
   455         file.append('\\section ' + entryid + ' [' + outkey + ']')
   456         file.append('<div style="' + divstyle + '">')
   457         for line in entry[3:]:
   458             file.append(line)
   459         file.append('</div>')
   460         file.append('')
   461 
   462     return file
   463 
   464 
   465 #
   466 # return 1 iff abbr is in line but not inside braces or quotes
   467 # assumes that abbr appears only once on the line (out of braces and quotes)
   468 #
   469 def verify_out_of_braces(line, abbr):
   470 
   471     phrase_split = delimiter_rex.split(line)
   472 
   473     abbr_rex = re.compile( '\\b' + abbr + '\\b', re.I)
   474 
   475     open_brace = 0
   476     open_quote = 0
   477 
   478     for phrase in phrase_split:
   479         if phrase == "{":
   480             open_brace = open_brace + 1
   481         elif phrase == "}":
   482             open_brace = open_brace - 1
   483         elif phrase == '"':
   484             if open_quote == 1:
   485                 open_quote = 0
   486             else:
   487                 open_quote = 1
   488         elif abbr_rex.search(phrase):
   489             if open_brace == 0 and open_quote == 0:
   490                 return 1
   491 
   492     return 0
   493 
   494 
   495 #
   496 # a line in the form phrase1 # phrase2 # ... # phrasen
   497 # is returned as phrase1 phrase2 ... phrasen
   498 # with the correct punctuation
   499 # Bug: Doesn't always work with multiple abbreviations plugged in
   500 #
   501 def concat_line(line):
   502     # only look at part after equals
   503     field = field_rex.sub('\g<1>',line)
   504     rest = field_rex.sub('\g<2>',line)
   505 
   506     concat_line = field + ' ='
   507 
   508     pound_split = concatsplit_rex.split(rest)
   509 
   510     phrase_count = 0
   511     length = len(pound_split)
   512 
   513     for phrase in pound_split:
   514         phrase = phrase.strip()
   515         if phrase_count != 0:
   516             if phrase.startswith('"') or phrase.startswith('{'):
   517                 phrase = phrase[1:]
   518         elif phrase.startswith('"'):
   519             phrase = phrase.replace('"','{',1)
   520 
   521         if phrase_count != length-1:
   522             if phrase.endswith('"') or phrase.endswith('}'):
   523                 phrase = phrase[:-1]
   524         else:
   525             if phrase.endswith('"'):
   526                 phrase = phrase[:-1]
   527                 phrase = phrase + "}"
   528             elif phrase.endswith('",'):
   529                 phrase = phrase[:-2]
   530                 phrase = phrase + "},"
   531 
   532         # if phrase did have \#, add the \# back
   533         if phrase.endswith('\\'):
   534             phrase = phrase + "#"
   535         concat_line = concat_line + ' ' + phrase
   536 
   537         phrase_count = phrase_count + 1
   538 
   539     return concat_line
   540 
   541 
   542 #
   543 # substitute abbreviations into filecont
   544 # @param filecont_source - string of data from file
   545 #
   546 def bibtex_replace_abbreviations(filecont_source):
   547     filecont = filecont_source.splitlines()
   548 
   549     #  These are defined in bibtex, so we'll define them too
   550     abbr_list = ['jan','feb','mar','apr','may','jun',
   551                  'jul','aug','sep','oct','nov','dec']
   552     value_list = ['January','February','March','April',
   553                   'May','June','July','August','September',
   554                   'October','November','December']
   555 
   556     abbr_rex = []
   557     total_abbr_count = 0
   558 
   559     front = '\\b'
   560     back = '(,?)\\b'
   561 
   562     for x in abbr_list:
   563         abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
   564         total_abbr_count = total_abbr_count + 1
   565 
   566 
   567     abbrdef_rex = re.compile('\s*@string\s*{\s*('+ valid_name_chars +'*)\s*=(.*)',
   568                              re.I)
   569 
   570     comment_rex = re.compile('@comment\s*{',re.I)
   571     preamble_rex = re.compile('@preamble\s*{',re.I)
   572 
   573     waiting_for_end_string = 0
   574     i = 0
   575     filecont2 = ''
   576 
   577     for line in filecont:
   578         if line == ' ' or line == '':
   579             continue
   580 
   581         if waiting_for_end_string:
   582             if re.search('}',line):
   583                 waiting_for_end_string = 0
   584                 continue
   585 
   586         if abbrdef_rex.search(line):
   587             abbr = abbrdef_rex.sub('\g<1>', line)
   588 
   589             if abbr_list.count(abbr) == 0:
   590                 val = abbrdef_rex.sub('\g<2>', line)
   591                 abbr_list.append(abbr)
   592                 value_list.append(string.strip(val))
   593                 abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
   594                 total_abbr_count = total_abbr_count + 1
   595             waiting_for_end_string = 1
   596             continue
   597 
   598         if comment_rex.search(line):
   599             waiting_for_end_string = 1
   600             continue
   601 
   602         if preamble_rex.search(line):
   603             waiting_for_end_string = 1
   604             continue
   605 
   606 
   607         # replace subsequent abbreviations with the value
   608         abbr_count = 0
   609 
   610         for x in abbr_list:
   611 
   612             if abbr_rex[abbr_count].search(line):
   613                 if verify_out_of_braces(line,abbr_list[abbr_count]) == 1:
   614                     line = abbr_rex[abbr_count].sub( value_list[abbr_count] + '\g<1>', line)
   615                 # Check for # concatenations
   616                 if concatsplit_rex.search(line):
   617                     line = concat_line(line)
   618             abbr_count = abbr_count + 1
   619 
   620 
   621         filecont2 = filecont2 + line + '\n'
   622         i = i+1
   623 
   624 
   625     # Do one final pass over file
   626 
   627     # make sure that didn't end up with {" or }" after the substitution
   628     filecont2 = filecont2.replace('{"','{{')
   629     filecont2 = filecont2.replace('"}','}}')
   630 
   631     afterquotevalue_rex = re.compile('"\s*,\s*')
   632     afterbrace_rex = re.compile('"\s*}')
   633     afterbracevalue_rex = re.compile('(=\s*{[^=]*)},\s*')
   634 
   635     # add new lines to data that changed because of abbreviation substitutions
   636     filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
   637     filecont2 = afterbrace_rex.sub('"\n}', filecont2)
   638     filecont2 = afterbracevalue_rex.sub('\g<1>},\n', filecont2)
   639 
   640     return filecont2
   641 
   642 #
   643 # convert @type( ... ) to @type{ ... }
   644 #
   645 def no_outer_parens(filecont):
   646 
   647     # do checking for open parens
   648     # will convert to braces
   649     paren_split = re.split('([(){}])',filecont)
   650 
   651     open_paren_count = 0
   652     open_type = 0
   653     look_next = 0
   654 
   655     # rebuild filecont
   656     filecont = ''
   657 
   658     at_rex = re.compile('@\w*')
   659 
   660     for phrase in paren_split:
   661         if look_next == 1:
   662             if phrase == '(':
   663                 phrase = '{'
   664                 open_paren_count = open_paren_count + 1
   665             else:
   666                 open_type = 0
   667             look_next = 0
   668 
   669         if phrase == '(':
   670             open_paren_count = open_paren_count + 1
   671 
   672         elif phrase == ')':
   673             open_paren_count = open_paren_count - 1
   674             if open_type == 1 and open_paren_count == 0:
   675                 phrase = '}'
   676                 open_type = 0
   677 
   678         elif at_rex.search( phrase ):
   679             open_type = 1
   680             look_next = 1
   681 
   682         filecont = filecont + phrase
   683 
   684     return filecont
   685 
   686 
   687 #
   688 # make all whitespace into just one space
   689 # format the bibtex file into a usable form.
   690 #
   691 def bibtexwasher(filecont_source):
   692 
   693     space_rex = re.compile('\s+')
   694     comment_rex = re.compile('\s*%')
   695 
   696     filecont = []
   697 
   698     # remove trailing and excessive whitespace
   699     # ignore comments
   700     for line in filecont_source:
   701         line = string.strip(line)
   702         line = space_rex.sub(' ', line)
   703         # ignore comments
   704         if not comment_rex.match(line) and line != '':
   705             filecont.append(' '+ line)
   706 
   707     filecont = string.join(filecont, '')
   708 
   709     # the file is in one long string
   710 
   711     filecont = no_outer_parens(filecont)
   712 
   713     #
   714     # split lines according to preferred syntax scheme
   715     #
   716     filecont = re.sub('(=\s*{[^=]*)},', '\g<1>},\n', filecont)
   717 
   718     # add new lines after commas that are after values
   719     filecont = re.sub('"\s*,', '",\n', filecont)
   720     filecont = re.sub('=\s*([\w\d]+)\s*,', '= \g<1>,\n', filecont)
   721     filecont = re.sub('(@\w*)\s*({(\s*)[^,\s]*)\s*,',
   722                           '\n\n\g<1>\g<2>,\n', filecont)
   723 
   724     # add new lines after }
   725     filecont = re.sub('"\s*}','"\n}\n', filecont)
   726     filecont = re.sub('}\s*,','},\n', filecont)
   727 
   728 
   729     filecont = re.sub('@(\w*)', '\n@\g<1>', filecont)
   730 
   731     # character encoding, reserved latex characters
   732     filecont = re.sub('{\\\&}', '&', filecont)
   733     filecont = re.sub('\\\&', '&', filecont)
   734 
   735     # do checking for open braces to get format correct
   736     open_brace_count = 0
   737     brace_split = re.split('([{}])',filecont)
   738 
   739     # rebuild filecont
   740     filecont = ''
   741 
   742     for phrase in brace_split:
   743         if phrase == '{':
   744             open_brace_count = open_brace_count + 1
   745         elif phrase == '}':
   746             open_brace_count = open_brace_count - 1
   747             if open_brace_count == 0:
   748                 filecont = filecont + '\n'
   749 
   750         filecont = filecont + phrase
   751 
   752     filecont2 = bibtex_replace_abbreviations(filecont)
   753 
   754     # gather
   755     filecont = filecont2.splitlines()
   756     i=0
   757     j=0         # count the number of blank lines
   758     for line in filecont:
   759         # ignore blank lines
   760         if line == '' or line == ' ':
   761             j = j+1
   762             continue
   763         filecont[i] = line + '\n'
   764         i = i+1
   765 
   766     # get rid of the extra stuff at the end of the array
   767     # (The extra stuff are duplicates that are in the array because
   768     # blank lines were removed.)
   769     length = len( filecont)
   770     filecont[length-j:length] = []
   771 
   772     return filecont
   773 
   774 
   775 def filehandler(filepath):
   776     try:
   777         fd = open(filepath, 'r')
   778         filecont_source = fd.readlines()
   779         fd.close()
   780     except:
   781         print 'Could not open file:', filepath
   782     washeddata = bibtexwasher(filecont_source)
   783     outdata = bibtexdecoder(washeddata)
   784     print '/**'
   785     print '\page references References'
   786     print
   787     for line in outdata:
   788         print line
   789     print '*/'
   790 
   791 
   792 # main program
   793 
   794 def main():
   795     import sys
   796     if sys.argv[1:]:
   797         filepath = sys.argv[1]
   798     else:
   799         print "No input file"
   800         sys.exit()
   801     filehandler(filepath)
   802 
   803 if __name__ == "__main__": main()
   804 
   805 
   806 # end python script