scripts/bib2dox.py
author Balazs Dezso <deba@inf.elte.hu>
Sun, 14 Feb 2010 23:14:09 +0100
changeset 902 d2bc45e8f6f2
parent 792 68792fb2870f
child 905 c841ae1aca29
permissions -rwxr-xr-x
Merge bugfix #337
     1 #!/usr/bin/env /usr/local/Python/bin/python2.1
     2 """
     3   BibTeX to Doxygen converter
     4   Usage: python bib2dox.py bibfile.bib > bibfile.dox
     5 
     6   This code is the modification of the BibTeX to XML converter
     7   by Vidar Bronken Gundersen et al. See the original copyright notices below. 
     8 
     9   **********************************************************************
    10 
    11   Decoder for bibliographic data, BibTeX
    12   Usage: python bibtex2xml.py bibfile.bib > bibfile.xml
    13 
    14   v.8
    15   (c)2002-06-23 Vidar Bronken Gundersen
    16   http://bibtexml.sf.net/
    17   Reuse approved as long as this notification is kept.
    18   Licence: GPL.
    19 
    20   Contributions/thanks to:
    21   Egon Willighagen, http://sf.net/projects/jreferences/
    22   Richard Mahoney (for providing a test case)
    23 
    24   Edited by Sara Sprenkle to be more robust and handle more bibtex features.
    25   (c) 2003-01-15
    26 
    27   1.  Changed bibtex: tags to bibxml: tags.
    28   2.  Use xmlns:bibxml="http://bibtexml.sf.net/"
    29   3.  Allow spaces between @type and first {
    30   4.  "author" fields with multiple authors split by " and "
    31       are put in separate xml "bibxml:author" tags.
    32   5.  Option for Titles: words are capitalized
    33       only if first letter in title or capitalized inside braces
    34   6.  Removes braces from within field values
    35   7.  Ignores comments in bibtex file (including @comment{ or % )
    36   8.  Replaces some special latex tags, e.g., replaces ~ with '&#160;'
    37   9.  Handles bibtex @string abbreviations
    38         --> includes bibtex's default abbreviations for months
    39         --> does concatenation of abbr # " more " and " more " # abbr
    40   10. Handles @type( ... ) or @type{ ... }
    41   11. The keywords field is split on , or ; and put into separate xml
    42       "bibxml:keywords" tags
    43   12. Ignores @preamble
    44 
    45   Known Limitations
    46   1.  Does not transform Latex encoding like math mode and special
    47       latex symbols.
    48   2.  Does not parse author fields into first and last names.
    49       E.g., It does not do anything special to an author whose name is
    50       in the form LAST_NAME, FIRST_NAME
    51       In "author" tag, will show up as
    52       <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author>
    53   3.  Does not handle "crossref" fields other than to print
    54       <bibxml:crossref>...</bibxml:crossref>
    55   4.  Does not inform user of the input's format errors.  You just won't
    56       be able to transform the file later with XSL
    57 
    58   You will have to manually edit the XML output if you need to handle
    59   these (and unknown) limitations.
    60 
    61 """
    62 
    63 import string, re
    64 
    65 # set of valid name characters
    66 valid_name_chars = '[\w\-:]'
    67 
    68 #
    69 # define global regular expression variables
    70 #
    71 author_rex = re.compile('\s+and\s+')
    72 rembraces_rex = re.compile('[{}]')
    73 capitalize_rex = re.compile('({[^}]*})')
    74 
    75 # used by bibtexkeywords(data)
    76 keywords_rex = re.compile('[,;]')
    77 
    78 # used by concat_line(line)
    79 concatsplit_rex = re.compile('\s*#\s*')
    80 
    81 # split on {, }, or " in verify_out_of_braces
    82 delimiter_rex = re.compile('([{}"])',re.I)
    83 
    84 field_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
    85 data_rex = re.compile('\s*(\w*)\s*=\s*([^,]*),?')
    86 
    87 url_rex = re.compile('\\\url\{([^}]*)\}')
    88 
    89 #
    90 # styles for html formatting
    91 #
    92 divstyle = 'margin-top: -4ex; margin-left: 8em;'
    93 
    94 #
    95 # return the string parameter without braces
    96 #
    97 def transformurls(str):
    98     return url_rex.sub(r'<a href="\1">\1</a>', str)
    99 
   100 #
   101 # return the string parameter without braces
   102 #
   103 def removebraces(str):
   104     return rembraces_rex.sub('', str)
   105 
   106 #
   107 # latex-specific replacements
   108 # (do this after braces were removed)
   109 #
   110 def latexreplacements(line):
   111     line = string.replace(line, '~', '&nbsp;')
   112     line = string.replace(line, '\\\'a', '&aacute;')
   113     line = string.replace(line, '\\"a', '&auml;')
   114     line = string.replace(line, '\\\'e', '&eacute;')
   115     line = string.replace(line, '\\"e', '&euml;')
   116     line = string.replace(line, '\\\'i', '&iacute;')
   117     line = string.replace(line, '\\"i', '&iuml;')
   118     line = string.replace(line, '\\\'o', '&oacute;')
   119     line = string.replace(line, '\\"o', '&ouml;')
   120     line = string.replace(line, '\\\'u', '&uacute;')
   121     line = string.replace(line, '\\"u', '&uuml;')
   122     line = string.replace(line, '\\H o', '&otilde;')
   123     line = string.replace(line, '\\H u', '&uuml;')   # &utilde; does not exist
   124     line = string.replace(line, '\\\'A', '&Aacute;')
   125     line = string.replace(line, '\\"A', '&Auml;')
   126     line = string.replace(line, '\\\'E', '&Eacute;')
   127     line = string.replace(line, '\\"E', '&Euml;')
   128     line = string.replace(line, '\\\'I', '&Iacute;')
   129     line = string.replace(line, '\\"I', '&Iuml;')
   130     line = string.replace(line, '\\\'O', '&Oacute;')
   131     line = string.replace(line, '\\"O', '&Ouml;')
   132     line = string.replace(line, '\\\'U', '&Uacute;')
   133     line = string.replace(line, '\\"U', '&Uuml;')
   134     line = string.replace(line, '\\H O', '&Otilde;')
   135     line = string.replace(line, '\\H U', '&Uuml;')   # &Utilde; does not exist
   136 
   137     return line
   138 
   139 #
   140 # copy characters form a string decoding html expressions (&xyz;)
   141 #
   142 def copychars(str, ifrom, count):
   143     result = ''
   144     i = ifrom
   145     c = 0
   146     html_spec = False
   147     while (i < len(str)) and (c < count):
   148         if str[i] == '&':
   149             html_spec = True;
   150             if i+1 < len(str):
   151                 result += str[i+1]
   152             c += 1
   153             i += 2
   154         else:
   155             if not html_spec:
   156                 if ((str[i] >= 'A') and (str[i] <= 'Z')) or \
   157                    ((str[i] >= 'a') and (str[i] <= 'z')):
   158                     result += str[i]
   159                     c += 1
   160             elif str[i] == ';':
   161                 html_spec = False;
   162             i += 1
   163     
   164     return result
   165 
   166 
   167 # 
   168 # Handle a list of authors (separated by 'and').
   169 # It gives back an array of the follwing values:
   170 #  - num: the number of authors,
   171 #  - list: the list of the author names,
   172 #  - text: the bibtex text (separated by commas and/or 'and')
   173 #  - abbrev: abbreviation that can be used for indicate the
   174 #    bibliography entries
   175 #
   176 def bibtexauthor(data):
   177     result = {}
   178     bibtex = ''
   179     result['list'] = author_rex.split(data)
   180     result['num'] = len(result['list'])
   181     for i, author in enumerate(result['list']):
   182         # general transformations
   183         author = latexreplacements(removebraces(author.strip()))
   184         # transform "Xyz, A. B." to "A. B. Xyz"
   185         pos = author.find(',')
   186         if pos != -1:
   187             author = author[pos+1:].strip() + ' ' + author[:pos].strip()
   188         result['list'][i] = author
   189         bibtex += author + '#'
   190     bibtex = bibtex[:-1]
   191     if result['num'] > 1:
   192         ix = bibtex.rfind('#')
   193         if result['num'] == 2:
   194             bibtex = bibtex[:ix] + ' and ' + bibtex[ix+1:]
   195         else:
   196             bibtex = bibtex[:ix] + ', and ' + bibtex[ix+1:]
   197     bibtex = bibtex.replace('#', ', ')
   198     result['text'] = bibtex
   199     
   200     result['abbrev'] = ''
   201     for author in result['list']:
   202         pos = author.rfind(' ') + 1
   203         count = 1
   204         if result['num'] == 1:
   205             count = 3
   206         result['abbrev'] += copychars(author, pos, count)
   207 
   208     return result
   209 
   210 
   211 #
   212 # data = title string
   213 # @return the capitalized title (first letter is capitalized), rest are capitalized
   214 # only if capitalized inside braces
   215 #
   216 def capitalizetitle(data):
   217     title_list = capitalize_rex.split(data)
   218     title = ''
   219     count = 0
   220     for phrase in title_list:
   221          check = string.lstrip(phrase)
   222 
   223          # keep phrase's capitalization the same
   224          if check.find('{') == 0:
   225               title += removebraces(phrase)
   226          else:
   227          # first word --> capitalize first letter (after spaces)
   228               if count == 0:
   229                   title += check.capitalize()
   230               else:
   231                   title += phrase.lower()
   232          count = count + 1
   233 
   234     return title
   235 
   236 
   237 #
   238 # @return the bibtex for the title
   239 # @param data --> title string
   240 # braces are removed from title
   241 #
   242 def bibtextitle(data, entrytype):
   243     if entrytype in ('book', 'inbook'):
   244         title = removebraces(data.strip())
   245     else:
   246         title = removebraces(capitalizetitle(data.strip()))
   247     bibtex = title
   248     return bibtex
   249 
   250 
   251 #
   252 # function to compare entry lists
   253 #
   254 def entry_cmp(x, y):
   255     return cmp(x[0], y[0])
   256 
   257 
   258 #
   259 # print the XML for the transformed "filecont_source"
   260 #
   261 def bibtexdecoder(filecont_source):
   262     filecont = []
   263     file = []
   264     
   265     # want @<alphanumeric chars><spaces>{<spaces><any chars>,
   266     pubtype_rex = re.compile('@(\w*)\s*{\s*(.*),')
   267     endtype_rex = re.compile('}\s*$')
   268     endtag_rex = re.compile('^\s*}\s*$')
   269 
   270     bracefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
   271     bracedata_rex = re.compile('\s*(\w*)\s*=\s*{(.*)},?')
   272 
   273     quotefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
   274     quotedata_rex = re.compile('\s*(\w*)\s*=\s*"(.*)",?')
   275 
   276     for line in filecont_source:
   277         line = line[:-1]
   278 
   279         # encode character entities
   280         line = string.replace(line, '&', '&amp;')
   281         line = string.replace(line, '<', '&lt;')
   282         line = string.replace(line, '>', '&gt;')
   283 
   284         # start entry: publication type (store for later use)
   285         if pubtype_rex.match(line):
   286         # want @<alphanumeric chars><spaces>{<spaces><any chars>,
   287             entrycont = {}
   288             entry = []
   289             entrytype = pubtype_rex.sub('\g<1>',line)
   290             entrytype = string.lower(entrytype)
   291             entryid   = pubtype_rex.sub('\g<2>', line)
   292 
   293         # end entry if just a }
   294         elif endtype_rex.match(line):
   295             # generate doxygen code for the entry
   296 
   297             # enty type related formattings
   298             if entrytype in ('book', 'inbook'):
   299                 entrycont['title'] = '<em>' + entrycont['title'] + '</em>'
   300                 if not entrycont.has_key('author'):
   301                     entrycont['author'] = entrycont['editor']
   302                     entrycont['author']['text'] += ', editors'
   303             elif entrytype == 'article':
   304                 entrycont['journal'] = '<em>' + entrycont['journal'] + '</em>'
   305             elif entrytype in ('inproceedings', 'incollection', 'conference'):
   306                 entrycont['booktitle'] = '<em>' + entrycont['booktitle'] + '</em>'
   307             elif entrytype == 'techreport':
   308                 if not entrycont.has_key('type'):
   309                     entrycont['type'] = 'Technical report'
   310             elif entrytype == 'mastersthesis':
   311                 entrycont['type'] = 'Master\'s thesis'
   312             elif entrytype == 'phdthesis':
   313                 entrycont['type'] = 'PhD thesis'
   314 
   315             for eline in entrycont:
   316                 if eline != '':
   317                     eline = latexreplacements(eline)
   318 
   319             if entrycont.has_key('pages') and (entrycont['pages'] != ''):
   320                 entrycont['pages'] = string.replace(entrycont['pages'], '--', '-')
   321 
   322             if entrycont.has_key('author') and (entrycont['author'] != ''):
   323                 entry.append(entrycont['author']['text'] + '.')
   324             if entrycont.has_key('title') and (entrycont['title'] != ''):
   325                 entry.append(entrycont['title'] + '.')
   326             if entrycont.has_key('journal') and (entrycont['journal'] != ''):
   327                 entry.append(entrycont['journal'] + ',')
   328             if entrycont.has_key('booktitle') and (entrycont['booktitle'] != ''):
   329                 entry.append('In ' + entrycont['booktitle'] + ',')
   330             if entrycont.has_key('type') and (entrycont['type'] != ''):
   331                 eline = entrycont['type']
   332                 if entrycont.has_key('number') and (entrycont['number'] != ''):
   333                     eline += ' ' + entrycont['number']
   334                 eline += ','
   335                 entry.append(eline)
   336             if entrycont.has_key('institution') and (entrycont['institution'] != ''):
   337                 entry.append(entrycont['institution'] + ',')
   338             if entrycont.has_key('publisher') and (entrycont['publisher'] != ''):
   339                 entry.append(entrycont['publisher'] + ',')
   340             if entrycont.has_key('school') and (entrycont['school'] != ''):
   341                 entry.append(entrycont['school'] + ',')
   342             if entrycont.has_key('address') and (entrycont['address'] != ''):
   343                 entry.append(entrycont['address'] + ',')
   344             if entrycont.has_key('edition') and (entrycont['edition'] != ''):
   345                 entry.append(entrycont['edition'] + ' edition,')
   346             if entrycont.has_key('howpublished') and (entrycont['howpublished'] != ''):
   347                 entry.append(entrycont['howpublished'] + ',')
   348             if entrycont.has_key('volume') and (entrycont['volume'] != ''):
   349                 eline = entrycont['volume'];
   350                 if entrycont.has_key('number') and (entrycont['number'] != ''):
   351                     eline += '(' + entrycont['number'] + ')'
   352                 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
   353                     eline += ':' + entrycont['pages']
   354                 eline += ','
   355                 entry.append(eline)
   356             else:
   357                 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
   358                     entry.append('pages ' + entrycont['pages'] + ',')
   359             if entrycont.has_key('year') and (entrycont['year'] != ''):
   360                 if entrycont.has_key('month') and (entrycont['month'] != ''):
   361                     entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')
   362                 else:
   363                     entry.append(entrycont['year'] + '.')
   364             if entrycont.has_key('note') and (entrycont['note'] != ''):
   365                 entry.append(entrycont['note'] + '.')
   366             if entrycont.has_key('url') and (entrycont['url'] != ''):
   367                 entry.append(entrycont['url'] + '.')
   368 
   369             # generate keys for sorting and for the output
   370             sortkey = ''
   371             bibkey = ''
   372             if entrycont.has_key('author'):
   373                 for author in entrycont['author']['list']:
   374                     sortkey += copychars(author, author.rfind(' ')+1, len(author))
   375                 bibkey = entrycont['author']['abbrev']
   376             else:
   377                 bibkey = 'x'
   378             if entrycont.has_key('year'):
   379                 sortkey += entrycont['year']
   380                 bibkey += entrycont['year'][-2:]
   381             if entrycont.has_key('title'):
   382                 sortkey += entrycont['title']
   383             if entrycont.has_key('key'):
   384                 sortkey = entrycont['key'] + sortkey
   385                 bibkey = entrycont['key']
   386             entry.insert(0, sortkey)
   387             entry.insert(1, bibkey)
   388             entry.insert(2, entryid)
   389            
   390             # add the entry to the file contents
   391             filecont.append(entry)
   392 
   393         else:
   394             # field, publication info
   395             field = ''
   396             data = ''
   397             
   398             # field = {data} entries
   399             if bracedata_rex.match(line):
   400                 field = bracefield_rex.sub('\g<1>', line)
   401                 field = string.lower(field)
   402                 data =  bracedata_rex.sub('\g<2>', line)
   403 
   404             # field = "data" entries
   405             elif quotedata_rex.match(line):
   406                 field = quotefield_rex.sub('\g<1>', line)
   407                 field = string.lower(field)
   408                 data =  quotedata_rex.sub('\g<2>', line)
   409 
   410             # field = data entries
   411             elif data_rex.match(line):
   412                 field = field_rex.sub('\g<1>', line)
   413                 field = string.lower(field)
   414                 data =  data_rex.sub('\g<2>', line)
   415 
   416             if field == 'url':
   417                 data = '\\url{' + data.strip() + '}'
   418             
   419             if field in ('author', 'editor'):
   420                 entrycont[field] = bibtexauthor(data)
   421                 line = ''
   422             elif field == 'title':
   423                 line = bibtextitle(data, entrytype)
   424             elif field != '':
   425                 line = removebraces(transformurls(data.strip()))
   426 
   427             if line != '':
   428                 line = latexreplacements(line)
   429                 entrycont[field] = line
   430 
   431 
   432     # sort entries
   433     filecont.sort(entry_cmp)
   434     
   435     # count the bibtex keys
   436     keytable = {}
   437     counttable = {}
   438     for entry in filecont:
   439         bibkey = entry[1]
   440         if not keytable.has_key(bibkey):
   441             keytable[bibkey] = 1
   442         else:
   443             keytable[bibkey] += 1
   444 
   445     for bibkey in keytable.keys():
   446         counttable[bibkey] = 0
   447     
   448     # generate output
   449     for entry in filecont:
   450         # generate output key form the bibtex key
   451         bibkey = entry[1]
   452         entryid = entry[2]
   453         if keytable[bibkey] == 1:
   454             outkey = bibkey
   455         else:
   456             outkey = bibkey + chr(97 + counttable[bibkey])
   457         counttable[bibkey] += 1
   458         
   459         # append the entry code to the output
   460         file.append('\\section ' + entryid + ' [' + outkey + ']')
   461         file.append('<div style="' + divstyle + '">')
   462         for line in entry[3:]:
   463             file.append(line)
   464         file.append('</div>')
   465         file.append('')
   466 
   467     return file
   468 
   469 
   470 #
   471 # return 1 iff abbr is in line but not inside braces or quotes
   472 # assumes that abbr appears only once on the line (out of braces and quotes)
   473 #
   474 def verify_out_of_braces(line, abbr):
   475 
   476     phrase_split = delimiter_rex.split(line)
   477 
   478     abbr_rex = re.compile( '\\b' + abbr + '\\b', re.I)
   479 
   480     open_brace = 0
   481     open_quote = 0
   482 
   483     for phrase in phrase_split:
   484         if phrase == "{":
   485             open_brace = open_brace + 1
   486         elif phrase == "}":
   487             open_brace = open_brace - 1
   488         elif phrase == '"':
   489             if open_quote == 1:
   490                 open_quote = 0
   491             else:
   492                 open_quote = 1
   493         elif abbr_rex.search(phrase):
   494             if open_brace == 0 and open_quote == 0:
   495                 return 1
   496 
   497     return 0
   498 
   499 
   500 #
   501 # a line in the form phrase1 # phrase2 # ... # phrasen
   502 # is returned as phrase1 phrase2 ... phrasen
   503 # with the correct punctuation
   504 # Bug: Doesn't always work with multiple abbreviations plugged in
   505 #
   506 def concat_line(line):
   507     # only look at part after equals
   508     field = field_rex.sub('\g<1>',line)
   509     rest = field_rex.sub('\g<2>',line)
   510 
   511     concat_line = field + ' ='
   512 
   513     pound_split = concatsplit_rex.split(rest)
   514 
   515     phrase_count = 0
   516     length = len(pound_split)
   517 
   518     for phrase in pound_split:
   519         phrase = phrase.strip()
   520         if phrase_count != 0:
   521             if phrase.startswith('"') or phrase.startswith('{'):
   522                 phrase = phrase[1:]
   523         elif phrase.startswith('"'):
   524             phrase = phrase.replace('"','{',1)
   525 
   526         if phrase_count != length-1:
   527             if phrase.endswith('"') or phrase.endswith('}'):
   528                 phrase = phrase[:-1]
   529         else:
   530             if phrase.endswith('"'):
   531                 phrase = phrase[:-1]
   532                 phrase = phrase + "}"
   533             elif phrase.endswith('",'):
   534                 phrase = phrase[:-2]
   535                 phrase = phrase + "},"
   536 
   537         # if phrase did have \#, add the \# back
   538         if phrase.endswith('\\'):
   539             phrase = phrase + "#"
   540         concat_line = concat_line + ' ' + phrase
   541 
   542         phrase_count = phrase_count + 1
   543 
   544     return concat_line
   545 
   546 
   547 #
   548 # substitute abbreviations into filecont
   549 # @param filecont_source - string of data from file
   550 #
   551 def bibtex_replace_abbreviations(filecont_source):
   552     filecont = filecont_source.splitlines()
   553 
   554     #  These are defined in bibtex, so we'll define them too
   555     abbr_list = ['jan','feb','mar','apr','may','jun',
   556                  'jul','aug','sep','oct','nov','dec']
   557     value_list = ['January','February','March','April',
   558                   'May','June','July','August','September',
   559                   'October','November','December']
   560 
   561     abbr_rex = []
   562     total_abbr_count = 0
   563 
   564     front = '\\b'
   565     back = '(,?)\\b'
   566 
   567     for x in abbr_list:
   568         abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
   569         total_abbr_count = total_abbr_count + 1
   570 
   571 
   572     abbrdef_rex = re.compile('\s*@string\s*{\s*('+ valid_name_chars +'*)\s*=(.*)',
   573                              re.I)
   574 
   575     comment_rex = re.compile('@comment\s*{',re.I)
   576     preamble_rex = re.compile('@preamble\s*{',re.I)
   577 
   578     waiting_for_end_string = 0
   579     i = 0
   580     filecont2 = ''
   581 
   582     for line in filecont:
   583         if line == ' ' or line == '':
   584             continue
   585 
   586         if waiting_for_end_string:
   587             if re.search('}',line):
   588                 waiting_for_end_string = 0
   589                 continue
   590 
   591         if abbrdef_rex.search(line):
   592             abbr = abbrdef_rex.sub('\g<1>', line)
   593 
   594             if abbr_list.count(abbr) == 0:
   595                 val = abbrdef_rex.sub('\g<2>', line)
   596                 abbr_list.append(abbr)
   597                 value_list.append(string.strip(val))
   598                 abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
   599                 total_abbr_count = total_abbr_count + 1
   600             waiting_for_end_string = 1
   601             continue
   602 
   603         if comment_rex.search(line):
   604             waiting_for_end_string = 1
   605             continue
   606 
   607         if preamble_rex.search(line):
   608             waiting_for_end_string = 1
   609             continue
   610 
   611 
   612         # replace subsequent abbreviations with the value
   613         abbr_count = 0
   614 
   615         for x in abbr_list:
   616 
   617             if abbr_rex[abbr_count].search(line):
   618                 if verify_out_of_braces(line,abbr_list[abbr_count]) == 1:
   619                     line = abbr_rex[abbr_count].sub( value_list[abbr_count] + '\g<1>', line)
   620                 # Check for # concatenations
   621                 if concatsplit_rex.search(line):
   622                     line = concat_line(line)
   623             abbr_count = abbr_count + 1
   624 
   625 
   626         filecont2 = filecont2 + line + '\n'
   627         i = i+1
   628 
   629 
   630     # Do one final pass over file
   631 
   632     # make sure that didn't end up with {" or }" after the substitution
   633     filecont2 = filecont2.replace('{"','{{')
   634     filecont2 = filecont2.replace('"}','}}')
   635 
   636     afterquotevalue_rex = re.compile('"\s*,\s*')
   637     afterbrace_rex = re.compile('"\s*}')
   638     afterbracevalue_rex = re.compile('(=\s*{[^=]*)},\s*')
   639 
   640     # add new lines to data that changed because of abbreviation substitutions
   641     filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
   642     filecont2 = afterbrace_rex.sub('"\n}', filecont2)
   643     filecont2 = afterbracevalue_rex.sub('\g<1>},\n', filecont2)
   644 
   645     return filecont2
   646 
   647 #
   648 # convert @type( ... ) to @type{ ... }
   649 #
   650 def no_outer_parens(filecont):
   651 
   652     # do checking for open parens
   653     # will convert to braces
   654     paren_split = re.split('([(){}])',filecont)
   655 
   656     open_paren_count = 0
   657     open_type = 0
   658     look_next = 0
   659 
   660     # rebuild filecont
   661     filecont = ''
   662 
   663     at_rex = re.compile('@\w*')
   664 
   665     for phrase in paren_split:
   666         if look_next == 1:
   667             if phrase == '(':
   668                 phrase = '{'
   669                 open_paren_count = open_paren_count + 1
   670             else:
   671                 open_type = 0
   672             look_next = 0
   673 
   674         if phrase == '(':
   675             open_paren_count = open_paren_count + 1
   676 
   677         elif phrase == ')':
   678             open_paren_count = open_paren_count - 1
   679             if open_type == 1 and open_paren_count == 0:
   680                 phrase = '}'
   681                 open_type = 0
   682 
   683         elif at_rex.search( phrase ):
   684             open_type = 1
   685             look_next = 1
   686 
   687         filecont = filecont + phrase
   688 
   689     return filecont
   690 
   691 
   692 #
   693 # make all whitespace into just one space
   694 # format the bibtex file into a usable form.
   695 #
   696 def bibtexwasher(filecont_source):
   697 
   698     space_rex = re.compile('\s+')
   699     comment_rex = re.compile('\s*%')
   700 
   701     filecont = []
   702 
   703     # remove trailing and excessive whitespace
   704     # ignore comments
   705     for line in filecont_source:
   706         line = string.strip(line)
   707         line = space_rex.sub(' ', line)
   708         # ignore comments
   709         if not comment_rex.match(line) and line != '':
   710             filecont.append(' '+ line)
   711 
   712     filecont = string.join(filecont, '')
   713 
   714     # the file is in one long string
   715 
   716     filecont = no_outer_parens(filecont)
   717 
   718     #
   719     # split lines according to preferred syntax scheme
   720     #
   721     filecont = re.sub('(=\s*{[^=]*)},', '\g<1>},\n', filecont)
   722 
   723     # add new lines after commas that are after values
   724     filecont = re.sub('"\s*,', '",\n', filecont)
   725     filecont = re.sub('=\s*([\w\d]+)\s*,', '= \g<1>,\n', filecont)
   726     filecont = re.sub('(@\w*)\s*({(\s*)[^,\s]*)\s*,',
   727                           '\n\n\g<1>\g<2>,\n', filecont)
   728 
   729     # add new lines after }
   730     filecont = re.sub('"\s*}','"\n}\n', filecont)
   731     filecont = re.sub('}\s*,','},\n', filecont)
   732 
   733 
   734     filecont = re.sub('@(\w*)', '\n@\g<1>', filecont)
   735 
   736     # character encoding, reserved latex characters
   737     filecont = re.sub('{\\\&}', '&', filecont)
   738     filecont = re.sub('\\\&', '&', filecont)
   739 
   740     # do checking for open braces to get format correct
   741     open_brace_count = 0
   742     brace_split = re.split('([{}])',filecont)
   743 
   744     # rebuild filecont
   745     filecont = ''
   746 
   747     for phrase in brace_split:
   748         if phrase == '{':
   749             open_brace_count = open_brace_count + 1
   750         elif phrase == '}':
   751             open_brace_count = open_brace_count - 1
   752             if open_brace_count == 0:
   753                 filecont = filecont + '\n'
   754 
   755         filecont = filecont + phrase
   756 
   757     filecont2 = bibtex_replace_abbreviations(filecont)
   758 
   759     # gather
   760     filecont = filecont2.splitlines()
   761     i=0
   762     j=0         # count the number of blank lines
   763     for line in filecont:
   764         # ignore blank lines
   765         if line == '' or line == ' ':
   766             j = j+1
   767             continue
   768         filecont[i] = line + '\n'
   769         i = i+1
   770 
   771     # get rid of the extra stuff at the end of the array
   772     # (The extra stuff are duplicates that are in the array because
   773     # blank lines were removed.)
   774     length = len( filecont)
   775     filecont[length-j:length] = []
   776 
   777     return filecont
   778 
   779 
   780 def filehandler(filepath):
   781     try:
   782         fd = open(filepath, 'r')
   783         filecont_source = fd.readlines()
   784         fd.close()
   785     except:
   786         print 'Could not open file:', filepath
   787     washeddata = bibtexwasher(filecont_source)
   788     outdata = bibtexdecoder(washeddata)
   789     print '/**'
   790     print '\page references References'
   791     print
   792     for line in outdata:
   793         print line
   794     print '*/'
   795 
   796 
   797 # main program
   798 
   799 def main():
   800     import sys
   801     if sys.argv[1:]:
   802         filepath = sys.argv[1]
   803     else:
   804         print "No input file"
   805         sys.exit()
   806     filehandler(filepath)
   807 
   808 if __name__ == "__main__": main()
   809 
   810 
   811 # end python script