scripts/bib2dox.py
changeset 791 f8c468367dab
child 792 68792fb2870f
equal deleted inserted replaced
-1:000000000000 0:fd2ab6d67a38
       
     1 #!/usr/bin/env /usr/local/Python/bin/python2.1
       
     2 """
       
     3   BibTeX to Doxygen converter
       
     4   Usage: python bib2dox.py bibfile.bib > bibfile.dox
       
     5 
       
     6   This code is the modification of the BibTeX to XML converter
       
     7   by Vidar Bronken Gundersen et al. See the original copyright notices below. 
       
     8 
       
     9   **********************************************************************
       
    10 
       
    11   Decoder for bibliographic data, BibTeX
       
    12   Usage: python bibtex2xml.py bibfile.bib > bibfile.xml
       
    13 
       
    14   v.8
       
    15   (c)2002-06-23 Vidar Bronken Gundersen
       
    16   http://bibtexml.sf.net/
       
    17   Reuse approved as long as this notification is kept.
       
    18   Licence: GPL.
       
    19 
       
    20   Contributions/thanks to:
       
    21   Egon Willighagen, http://sf.net/projects/jreferences/
       
    22   Richard Mahoney (for providing a test case)
       
    23 
       
     24   Edited by Sara Sprenkle to be more robust and handle more bibtex features.
       
    25   (c) 2003-01-15
       
    26 
       
    27   1.  Changed bibtex: tags to bibxml: tags.
       
    28   2.  Use xmlns:bibxml="http://bibtexml.sf.net/"
       
    29   3.  Allow spaces between @type and first {
       
    30   4.  "author" fields with multiple authors split by " and "
       
    31       are put in separate xml "bibxml:author" tags.
       
    32   5.  Option for Titles: words are capitalized
       
    33       only if first letter in title or capitalized inside braces
       
    34   6.  Removes braces from within field values
       
    35   7.  Ignores comments in bibtex file (including @comment{ or % )
       
    36   8.  Replaces some special latex tags, e.g., replaces ~ with ' '
       
    37   9.  Handles bibtex @string abbreviations
       
    38         --> includes bibtex's default abbreviations for months
       
    39         --> does concatenation of abbr # " more " and " more " # abbr
       
    40   10. Handles @type( ... ) or @type{ ... }
       
    41   11. The keywords field is split on , or ; and put into separate xml
       
    42       "bibxml:keywords" tags
       
    43   12. Ignores @preamble
       
    44 
       
    45   Known Limitations
       
    46   1.  Does not transform Latex encoding like math mode and special
       
    47       latex symbols.
       
    48   2.  Does not parse author fields into first and last names.
       
    49       E.g., It does not do anything special to an author whose name is
       
    50       in the form LAST_NAME, FIRST_NAME
       
    51       In "author" tag, will show up as
       
    52       <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author>
       
    53   3.  Does not handle "crossref" fields other than to print
       
    54       <bibxml:crossref>...</bibxml:crossref>
       
    55   4.  Does not inform user of the input's format errors.  You just won't
       
    56       be able to transform the file later with XSL
       
    57 
       
    58   You will have to manually edit the XML output if you need to handle
       
    59   these (and unknown) limitations.
       
    60 
       
    61 """
       
    62 
       
    63 import string, re
       
    64 
       
    65 # set of valid name characters
       
    66 valid_name_chars = '[\w\-:]'
       
    67 
       
    68 #
       
    69 # define global regular expression variables
       
    70 #
       
    71 author_rex = re.compile('\s+and\s+')
       
    72 rembraces_rex = re.compile('[{}]')
       
    73 capitalize_rex = re.compile('({\w*})')
       
    74 
       
    75 # used by bibtexkeywords(data)
       
    76 keywords_rex = re.compile('[,;]')
       
    77 
       
    78 # used by concat_line(line)
       
    79 concatsplit_rex = re.compile('\s*#\s*')
       
    80 
       
    81 # split on {, }, or " in verify_out_of_braces
       
    82 delimiter_rex = re.compile('([{}"])',re.I)
       
    83 
       
    84 field_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
       
    85 data_rex = re.compile('\s*(\w*)\s*=\s*([^,]*),?')
       
    86 
       
    87 url_rex = re.compile('\\\url\{([^}]*)\}')
       
    88 
       
    89 
       
    90 #
       
     91 # return the string parameter with each \url{...} command turned into an HTML link
       
    92 #
       
    93 def transformurls(str):
       
    94     return url_rex.sub(r'<a href="\1">\1</a>', str)
       
    95 
       
    96 #
       
    97 # return the string parameter without braces
       
    98 #
       
    99 def removebraces(str):
       
   100     return rembraces_rex.sub('', str)
       
   101 
       
   102 #
       
   103 # latex-specific replacements
       
   104 # (do this after braces were removed)
       
   105 #
       
   106 def latexreplacements(line):
       
   107     line = string.replace(line, '~', '&nbsp;')
       
   108     line = string.replace(line, '\\\'a', '&aacute;')
       
   109     line = string.replace(line, '\\"a', '&auml;')
       
   110     line = string.replace(line, '\\\'e', '&eacute;')
       
   111     line = string.replace(line, '\\"e', '&euml;')
       
   112     line = string.replace(line, '\\\'i', '&iacute;')
       
   113     line = string.replace(line, '\\"i', '&iuml;')
       
   114     line = string.replace(line, '\\\'o', '&oacute;')
       
   115     line = string.replace(line, '\\"o', '&ouml;')
       
   116     line = string.replace(line, '\\\'u', '&uacute;')
       
   117     line = string.replace(line, '\\"u', '&uuml;')
       
   118     line = string.replace(line, '\\H o', '&otilde;')
       
   119     line = string.replace(line, '\\H u', '&uuml;')   # &utilde; does not exist
       
   120     line = string.replace(line, '\\\'A', '&Aacute;')
       
   121     line = string.replace(line, '\\"A', '&Auml;')
       
   122     line = string.replace(line, '\\\'E', '&Eacute;')
       
   123     line = string.replace(line, '\\"E', '&Euml;')
       
   124     line = string.replace(line, '\\\'I', '&Iacute;')
       
   125     line = string.replace(line, '\\"I', '&Iuml;')
       
   126     line = string.replace(line, '\\\'O', '&Oacute;')
       
   127     line = string.replace(line, '\\"O', '&Ouml;')
       
   128     line = string.replace(line, '\\\'U', '&Uacute;')
       
   129     line = string.replace(line, '\\"U', '&Uuml;')
       
   130     line = string.replace(line, '\\H O', '&Otilde;')
       
   131     line = string.replace(line, '\\H U', '&Uuml;')   # &Utilde; does not exist
       
   132 
       
   133     return line
       
   134 
       
   135 #
       
    136 # copy characters from a string, decoding html expressions (&xyz;)
       
   137 #
       
   138 def copychars(str, ifrom, count):
       
   139     result = ''
       
   140     i = ifrom
       
   141     c = 0
       
   142     html_spec = False
       
   143     while (i < len(str)) and (c < count):
       
   144         if str[i] == '&':
       
   145             html_spec = True;
       
   146             if i+1 < len(str):
       
   147                 result += str[i+1]
       
   148             c += 1
       
   149             i += 2
       
   150         else:
       
   151             if not html_spec:
       
   152                 if ((str[i] >= 'A') and (str[i] <= 'Z')) or \
       
   153                    ((str[i] >= 'a') and (str[i] <= 'z')):
       
   154                     result += str[i]
       
   155                     c += 1
       
   156             elif str[i] == ';':
       
   157                 html_spec = False;
       
   158             i += 1
       
   159     
       
   160     return result
       
   161 
       
   162 
       
   163 # 
       
   164 # Handle a list of authors (separated by 'and').
       
    165 # It gives back a dictionary with the following values:
       
   166 #  - num: the number of authors,
       
   167 #  - list: the list of the author names,
       
   168 #  - text: the bibtex text (separated by commas and/or 'and')
       
   169 #  - abbrev: abbreviation that can be used for indicate the
       
   170 #    bibliography entries
       
   171 #
       
   172 def bibtexauthor(data):
       
   173     result = {}
       
   174     bibtex = ''
       
   175     result['list'] = author_rex.split(data)
       
   176     result['num'] = len(result['list'])
       
   177     for i, author in enumerate(result['list']):
       
   178         # general transformations
       
   179         author = latexreplacements(removebraces(author.strip()))
       
   180         # transform "Xyz, A. B." to "A. B. Xyz"
       
   181         pos = author.find(',')
       
   182         if pos != -1:
       
   183             author = author[pos+1:].strip() + ' ' + author[:pos].strip()
       
   184         result['list'][i] = author
       
   185         bibtex += author + '#'
       
   186     bibtex = bibtex[:-1]
       
   187     if result['num'] > 1:
       
   188         ix = bibtex.rfind('#')
       
   189         if result['num'] == 2:
       
   190             bibtex = bibtex[:ix] + ' and ' + bibtex[ix+1:]
       
   191         else:
       
   192             bibtex = bibtex[:ix] + ', and ' + bibtex[ix+1:]
       
   193     bibtex = bibtex.replace('#', ', ')
       
   194     result['text'] = bibtex
       
   195     
       
   196     result['abbrev'] = ''
       
   197     for author in result['list']:
       
   198         pos = author.rfind(' ') + 1
       
   199         count = 1
       
   200         if result['num'] == 1:
       
   201             count = 3
       
   202         result['abbrev'] += copychars(author, pos, count)
       
   203 
       
   204     return result
       
   205 
       
   206 
       
   207 #
       
   208 # data = title string
       
   209 # @return the capitalized title (first letter is capitalized), rest are capitalized
       
   210 # only if capitalized inside braces
       
   211 #
       
   212 def capitalizetitle(data):
       
   213     title_list = capitalize_rex.split(data)
       
   214     title = ''
       
   215     count = 0
       
   216     for phrase in title_list:
       
   217          check = string.lstrip(phrase)
       
   218 
       
   219          # keep phrase's capitalization the same
       
   220          if check.find('{') == 0:
       
   221               title += removebraces(phrase)
       
   222          else:
       
   223          # first word --> capitalize first letter (after spaces)
       
   224               if count == 0:
       
   225                   title += check.capitalize()
       
   226               else:
       
   227                   title += phrase.lower()
       
   228          count = count + 1
       
   229 
       
   230     return title
       
   231 
       
   232 
       
   233 #
       
   234 # @return the bibtex for the title
       
   235 # @param data --> title string
       
   236 # braces are removed from title
       
   237 #
       
   238 def bibtextitle(data, entrytype):
       
   239     if entrytype in ('book', 'inbook'):
       
   240         title = removebraces(data.strip())
       
   241     else:
       
   242         title = removebraces(capitalizetitle(data.strip()))
       
   243     bibtex = title
       
   244     return bibtex
       
   245 
       
   246 
       
   247 #
       
   248 # function to compare entry lists
       
   249 #
       
   250 def entry_cmp(x, y):
       
   251     return cmp(x[0], y[0])
       
   252 
       
   253 
       
   254 #
       
    255 # return the HTML/Doxygen output lines generated from "filecont_source"
       
   256 #
       
   257 def bibtexdecoder(filecont_source):
       
   258     filecont = []
       
   259     file = []
       
   260     
       
   261     # want @<alphanumeric chars><spaces>{<spaces><any chars>,
       
   262     pubtype_rex = re.compile('@(\w*)\s*{\s*(.*),')
       
   263     endtype_rex = re.compile('}\s*$')
       
   264     endtag_rex = re.compile('^\s*}\s*$')
       
   265 
       
   266     bracefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
       
   267     bracedata_rex = re.compile('\s*(\w*)\s*=\s*{(.*)},?')
       
   268 
       
   269     quotefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
       
   270     quotedata_rex = re.compile('\s*(\w*)\s*=\s*"(.*)",?')
       
   271 
       
   272     for line in filecont_source:
       
   273         line = line[:-1]
       
   274 
       
   275         # encode character entities
       
   276         line = string.replace(line, '&', '&amp;')
       
   277         line = string.replace(line, '<', '&lt;')
       
   278         line = string.replace(line, '>', '&gt;')
       
   279 
       
   280         # start entry: publication type (store for later use)
       
   281         if pubtype_rex.match(line):
       
   282         # want @<alphanumeric chars><spaces>{<spaces><any chars>,
       
   283             entrycont = {}
       
   284             entry = []
       
   285             entrytype = pubtype_rex.sub('\g<1>',line)
       
   286             entrytype = string.lower(entrytype)
       
   287             # entryid   = pubtype_rex.sub('\g<2>', line)
       
   288 
       
   289         # end entry if just a }
       
   290         elif endtype_rex.match(line):
       
   291             # generate doxygen code for the entry
       
   292 
       
   293             # enty type related formattings
       
   294             if entrytype in ('book', 'inbook'):
       
   295                 entrycont['title'] = '<em>' + entrycont['title'] + '</em>'
       
   296                 if not entrycont.has_key('author'):
       
   297                     entrycont['author'] = entrycont['editor']
       
   298                     entrycont['author']['text'] += ', editors'
       
   299             elif entrytype == 'article':
       
   300                 entrycont['journal'] = '<em>' + entrycont['journal'] + '</em>'
       
   301             elif entrytype in ('inproceedings', 'incollection', 'conference'):
       
   302                 entrycont['booktitle'] = '<em>' + entrycont['booktitle'] + '</em>'
       
   303             elif entrytype == 'techreport':
       
   304                 if not entrycont.has_key('type'):
       
   305                     entrycont['type'] = 'Technical report'
       
   306             elif entrytype == 'mastersthesis':
       
   307                 entrycont['type'] = 'Master\'s thesis'
       
   308             elif entrytype == 'phdthesis':
       
   309                 entrycont['type'] = 'PhD thesis'
       
   310 
       
   311             for eline in entrycont:
       
   312                 if eline != '':
       
   313                     eline = latexreplacements(eline)
       
   314 
       
   315             if entrycont.has_key('pages') and (entrycont['pages'] != ''):
       
   316                 entrycont['pages'] = string.replace(entrycont['pages'], '--', '-')
       
   317 
       
   318             if entrycont.has_key('author') and (entrycont['author'] != ''):
       
   319                 entry.append(entrycont['author']['text'] + '.')
       
   320             if entrycont.has_key('title') and (entrycont['title'] != ''):
       
   321                 entry.append(entrycont['title'] + '.')
       
   322             if entrycont.has_key('journal') and (entrycont['journal'] != ''):
       
   323                 entry.append(entrycont['journal'] + ',')
       
   324             if entrycont.has_key('booktitle') and (entrycont['booktitle'] != ''):
       
   325                 entry.append('In ' + entrycont['booktitle'] + ',')
       
   326             if entrycont.has_key('type') and (entrycont['type'] != ''):
       
   327                 eline = entrycont['type']
       
   328                 if entrycont.has_key('number') and (entrycont['number'] != ''):
       
   329                     eline += ' ' + entrycont['number']
       
   330                 eline += ','
       
   331                 entry.append(eline)
       
   332             if entrycont.has_key('institution') and (entrycont['institution'] != ''):
       
   333                 entry.append(entrycont['institution'] + ',')
       
   334             if entrycont.has_key('publisher') and (entrycont['publisher'] != ''):
       
   335                 entry.append(entrycont['publisher'] + ',')
       
   336             if entrycont.has_key('school') and (entrycont['school'] != ''):
       
   337                 entry.append(entrycont['school'] + ',')
       
   338             if entrycont.has_key('address') and (entrycont['address'] != ''):
       
   339                 entry.append(entrycont['address'] + ',')
       
   340             if entrycont.has_key('edition') and (entrycont['edition'] != ''):
       
   341                 entry.append(entrycont['edition'] + ' edition,')
       
   342             if entrycont.has_key('howpublished') and (entrycont['howpublished'] != ''):
       
   343                 entry.append(entrycont['howpublished'] + ',')
       
   344             if entrycont.has_key('volume') and (entrycont['volume'] != ''):
       
   345                 eline = entrycont['volume'];
       
   346                 if entrycont.has_key('number') and (entrycont['number'] != ''):
       
   347                     eline += '(' + entrycont['number'] + ')'
       
   348                 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
       
   349                     eline += ':' + entrycont['pages']
       
   350                 eline += ','
       
   351                 entry.append(eline)
       
   352             else:
       
   353                 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
       
   354                     entry.append('pages ' + entrycont['pages'] + ',')
       
   355             if entrycont.has_key('year') and (entrycont['year'] != ''):
       
   356                 if entrycont.has_key('month') and (entrycont['month'] != ''):
       
   357                     entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')
       
   358                 else:
       
   359                     entry.append(entrycont['year'] + '.')
       
   360             if entrycont.has_key('note') and (entrycont['note'] != ''):
       
   361                 entry.append(entrycont['note'] + '.')
       
   362 
       
   363             # generate keys for sorting and for the output
       
   364             sortkey = ''
       
   365             bibkey = ''
       
   366             if entrycont.has_key('author'):
       
   367                 for author in entrycont['author']['list']:
       
   368                     sortkey += copychars(author, author.rfind(' ')+1, len(author))
       
   369                 bibkey = entrycont['author']['abbrev']
       
   370             else:
       
   371                 bibkey = 'x'
       
   372             if entrycont.has_key('year'):
       
   373                 sortkey += entrycont['year']
       
   374                 bibkey += entrycont['year'][-2:]
       
   375             if entrycont.has_key('title'):
       
   376                 sortkey += entrycont['title']
       
   377             if entrycont.has_key('key'):
       
   378                 sortkey = entrycont['key'] + sortkey
       
   379                 bibkey = entrycont['key']
       
   380             entry.insert(0, sortkey)
       
   381             entry.insert(1, bibkey)
       
   382            
       
   383             # add the entry to the file contents
       
   384             filecont.append(entry)
       
   385 
       
   386         else:
       
   387             # field, publication info
       
   388             field = ''
       
   389             data = ''
       
   390             
       
   391             # field = {data} entries
       
   392             if bracedata_rex.match(line):
       
   393                 field = bracefield_rex.sub('\g<1>', line)
       
   394                 field = string.lower(field)
       
   395                 data =  bracedata_rex.sub('\g<2>', line)
       
   396 
       
   397             # field = "data" entries
       
   398             elif quotedata_rex.match(line):
       
   399                 field = quotefield_rex.sub('\g<1>', line)
       
   400                 field = string.lower(field)
       
   401                 data =  quotedata_rex.sub('\g<2>', line)
       
   402 
       
   403             # field = data entries
       
   404             elif data_rex.match(line):
       
   405                 field = field_rex.sub('\g<1>', line)
       
   406                 field = string.lower(field)
       
   407                 data =  data_rex.sub('\g<2>', line)
       
   408             
       
   409             if field in ('author', 'editor'):
       
   410                 entrycont[field] = bibtexauthor(data)
       
   411                 line = ''
       
   412             elif field == 'title':
       
   413                 line = bibtextitle(data, entrytype)
       
   414             elif field != '':
       
   415                 line = removebraces(transformurls(data.strip()))
       
   416 
       
   417             if line != '':
       
   418                 line = latexreplacements(line)
       
   419                 entrycont[field] = line
       
   420 
       
   421 
       
   422     # sort entries
       
   423     filecont.sort(entry_cmp)
       
   424     
       
   425     # count the bibtex keys
       
   426     keytable = {}
       
   427     counttable = {}
       
   428     for entry in filecont:
       
   429         bibkey = entry[1]
       
   430         if not keytable.has_key(bibkey):
       
   431             keytable[bibkey] = 1
       
   432         else:
       
   433             keytable[bibkey] += 1
       
   434 
       
   435     for bibkey in keytable.keys():
       
   436         counttable[bibkey] = 0
       
   437     
       
   438     # generate output
       
   439     for entry in filecont:
       
   440         # generate output key form the bibtex key
       
   441         bibkey = entry[1]
       
   442         if keytable[bibkey] == 1:
       
   443             outkey = bibkey
       
   444         else:
       
   445             outkey = bibkey + chr(97 + counttable[bibkey])
       
   446         counttable[bibkey] += 1
       
   447         
       
   448         # append the entry code to the output
       
   449         file.append('<tr valign="top">\n' + \
       
   450                     '<td>[' + outkey + ']</td>')
       
   451         file.append('<td>')
       
   452         file.append('\\anchor ' + outkey)
       
   453         for line in entry[2:]:
       
   454             file.append(line)
       
   455         file.append('</td>\n</tr>')
       
   456         file.append('')
       
   457 
       
   458     return file
       
   459 
       
   460 
       
   461 #
       
   462 # return 1 iff abbr is in line but not inside braces or quotes
       
   463 # assumes that abbr appears only once on the line (out of braces and quotes)
       
   464 #
       
   465 def verify_out_of_braces(line, abbr):
       
   466 
       
   467     phrase_split = delimiter_rex.split(line)
       
   468 
       
   469     abbr_rex = re.compile( '\\b' + abbr + '\\b', re.I)
       
   470 
       
   471     open_brace = 0
       
   472     open_quote = 0
       
   473 
       
   474     for phrase in phrase_split:
       
   475         if phrase == "{":
       
   476             open_brace = open_brace + 1
       
   477         elif phrase == "}":
       
   478             open_brace = open_brace - 1
       
   479         elif phrase == '"':
       
   480             if open_quote == 1:
       
   481                 open_quote = 0
       
   482             else:
       
   483                 open_quote = 1
       
   484         elif abbr_rex.search(phrase):
       
   485             if open_brace == 0 and open_quote == 0:
       
   486                 return 1
       
   487 
       
   488     return 0
       
   489 
       
   490 
       
   491 #
       
   492 # a line in the form phrase1 # phrase2 # ... # phrasen
       
   493 # is returned as phrase1 phrase2 ... phrasen
       
   494 # with the correct punctuation
       
   495 # Bug: Doesn't always work with multiple abbreviations plugged in
       
   496 #
       
   497 def concat_line(line):
       
   498     # only look at part after equals
       
   499     field = field_rex.sub('\g<1>',line)
       
   500     rest = field_rex.sub('\g<2>',line)
       
   501 
       
   502     concat_line = field + ' ='
       
   503 
       
   504     pound_split = concatsplit_rex.split(rest)
       
   505 
       
   506     phrase_count = 0
       
   507     length = len(pound_split)
       
   508 
       
   509     for phrase in pound_split:
       
   510         phrase = phrase.strip()
       
   511         if phrase_count != 0:
       
   512             if phrase.startswith('"') or phrase.startswith('{'):
       
   513                 phrase = phrase[1:]
       
   514         elif phrase.startswith('"'):
       
   515             phrase = phrase.replace('"','{',1)
       
   516 
       
   517         if phrase_count != length-1:
       
   518             if phrase.endswith('"') or phrase.endswith('}'):
       
   519                 phrase = phrase[:-1]
       
   520         else:
       
   521             if phrase.endswith('"'):
       
   522                 phrase = phrase[:-1]
       
   523                 phrase = phrase + "}"
       
   524             elif phrase.endswith('",'):
       
   525                 phrase = phrase[:-2]
       
   526                 phrase = phrase + "},"
       
   527 
       
   528         # if phrase did have \#, add the \# back
       
   529         if phrase.endswith('\\'):
       
   530             phrase = phrase + "#"
       
   531         concat_line = concat_line + ' ' + phrase
       
   532 
       
   533         phrase_count = phrase_count + 1
       
   534 
       
   535     return concat_line
       
   536 
       
   537 
       
   538 #
       
   539 # substitute abbreviations into filecont
       
   540 # @param filecont_source - string of data from file
       
   541 #
       
   542 def bibtex_replace_abbreviations(filecont_source):
       
   543     filecont = filecont_source.splitlines()
       
   544 
       
   545     #  These are defined in bibtex, so we'll define them too
       
   546     abbr_list = ['jan','feb','mar','apr','may','jun',
       
   547                  'jul','aug','sep','oct','nov','dec']
       
   548     value_list = ['January','February','March','April',
       
   549                   'May','June','July','August','September',
       
   550                   'October','November','December']
       
   551 
       
   552     abbr_rex = []
       
   553     total_abbr_count = 0
       
   554 
       
   555     front = '\\b'
       
   556     back = '(,?)\\b'
       
   557 
       
   558     for x in abbr_list:
       
   559         abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
       
   560         total_abbr_count = total_abbr_count + 1
       
   561 
       
   562 
       
   563     abbrdef_rex = re.compile('\s*@string\s*{\s*('+ valid_name_chars +'*)\s*=(.*)',
       
   564                              re.I)
       
   565 
       
   566     comment_rex = re.compile('@comment\s*{',re.I)
       
   567     preamble_rex = re.compile('@preamble\s*{',re.I)
       
   568 
       
   569     waiting_for_end_string = 0
       
   570     i = 0
       
   571     filecont2 = ''
       
   572 
       
   573     for line in filecont:
       
   574         if line == ' ' or line == '':
       
   575             continue
       
   576 
       
   577         if waiting_for_end_string:
       
   578             if re.search('}',line):
       
   579                 waiting_for_end_string = 0
       
   580                 continue
       
   581 
       
   582         if abbrdef_rex.search(line):
       
   583             abbr = abbrdef_rex.sub('\g<1>', line)
       
   584 
       
   585             if abbr_list.count(abbr) == 0:
       
   586                 val = abbrdef_rex.sub('\g<2>', line)
       
   587                 abbr_list.append(abbr)
       
   588                 value_list.append(string.strip(val))
       
   589                 abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
       
   590                 total_abbr_count = total_abbr_count + 1
       
   591             waiting_for_end_string = 1
       
   592             continue
       
   593 
       
   594         if comment_rex.search(line):
       
   595             waiting_for_end_string = 1
       
   596             continue
       
   597 
       
   598         if preamble_rex.search(line):
       
   599             waiting_for_end_string = 1
       
   600             continue
       
   601 
       
   602 
       
   603         # replace subsequent abbreviations with the value
       
   604         abbr_count = 0
       
   605 
       
   606         for x in abbr_list:
       
   607 
       
   608             if abbr_rex[abbr_count].search(line):
       
   609                 if verify_out_of_braces(line,abbr_list[abbr_count]) == 1:
       
   610                     line = abbr_rex[abbr_count].sub( value_list[abbr_count] + '\g<1>', line)
       
   611                 # Check for # concatenations
       
   612                 if concatsplit_rex.search(line):
       
   613                     line = concat_line(line)
       
   614             abbr_count = abbr_count + 1
       
   615 
       
   616 
       
   617         filecont2 = filecont2 + line + '\n'
       
   618         i = i+1
       
   619 
       
   620 
       
   621     # Do one final pass over file
       
   622 
       
   623     # make sure that didn't end up with {" or }" after the substitution
       
   624     filecont2 = filecont2.replace('{"','{{')
       
   625     filecont2 = filecont2.replace('"}','}}')
       
   626 
       
   627     afterquotevalue_rex = re.compile('"\s*,\s*')
       
   628     afterbrace_rex = re.compile('"\s*}')
       
   629     afterbracevalue_rex = re.compile('(=\s*{[^=]*)},\s*')
       
   630 
       
   631     # add new lines to data that changed because of abbreviation substitutions
       
   632     filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
       
   633     filecont2 = afterbrace_rex.sub('"\n}', filecont2)
       
   634     filecont2 = afterbracevalue_rex.sub('\g<1>},\n', filecont2)
       
   635 
       
   636     return filecont2
       
   637 
       
   638 #
       
   639 # convert @type( ... ) to @type{ ... }
       
   640 #
       
   641 def no_outer_parens(filecont):
       
   642 
       
   643     # do checking for open parens
       
   644     # will convert to braces
       
   645     paren_split = re.split('([(){}])',filecont)
       
   646 
       
   647     open_paren_count = 0
       
   648     open_type = 0
       
   649     look_next = 0
       
   650 
       
   651     # rebuild filecont
       
   652     filecont = ''
       
   653 
       
   654     at_rex = re.compile('@\w*')
       
   655 
       
   656     for phrase in paren_split:
       
   657         if look_next == 1:
       
   658             if phrase == '(':
       
   659                 phrase = '{'
       
   660                 open_paren_count = open_paren_count + 1
       
   661             else:
       
   662                 open_type = 0
       
   663             look_next = 0
       
   664 
       
   665         if phrase == '(':
       
   666             open_paren_count = open_paren_count + 1
       
   667 
       
   668         elif phrase == ')':
       
   669             open_paren_count = open_paren_count - 1
       
   670             if open_type == 1 and open_paren_count == 0:
       
   671                 phrase = '}'
       
   672                 open_type = 0
       
   673 
       
   674         elif at_rex.search( phrase ):
       
   675             open_type = 1
       
   676             look_next = 1
       
   677 
       
   678         filecont = filecont + phrase
       
   679 
       
   680     return filecont
       
   681 
       
   682 
       
   683 #
       
   684 # make all whitespace into just one space
       
   685 # format the bibtex file into a usable form.
       
   686 #
       
   687 def bibtexwasher(filecont_source):
       
   688 
       
   689     space_rex = re.compile('\s+')
       
   690     comment_rex = re.compile('\s*%')
       
   691 
       
   692     filecont = []
       
   693 
       
   694     # remove trailing and excessive whitespace
       
   695     # ignore comments
       
   696     for line in filecont_source:
       
   697         line = string.strip(line)
       
   698         line = space_rex.sub(' ', line)
       
   699         # ignore comments
       
   700         if not comment_rex.match(line) and line != '':
       
   701             filecont.append(' '+ line)
       
   702 
       
   703     filecont = string.join(filecont, '')
       
   704 
       
   705     # the file is in one long string
       
   706 
       
   707     filecont = no_outer_parens(filecont)
       
   708 
       
   709     #
       
   710     # split lines according to preferred syntax scheme
       
   711     #
       
   712     filecont = re.sub('(=\s*{[^=]*)},', '\g<1>},\n', filecont)
       
   713 
       
   714     # add new lines after commas that are after values
       
   715     filecont = re.sub('"\s*,', '",\n', filecont)
       
   716     filecont = re.sub('=\s*([\w\d]+)\s*,', '= \g<1>,\n', filecont)
       
   717     filecont = re.sub('(@\w*)\s*({(\s*)[^,\s]*)\s*,',
       
   718                           '\n\n\g<1>\g<2>,\n', filecont)
       
   719 
       
   720     # add new lines after }
       
   721     filecont = re.sub('"\s*}','"\n}\n', filecont)
       
   722     filecont = re.sub('}\s*,','},\n', filecont)
       
   723 
       
   724 
       
   725     filecont = re.sub('@(\w*)', '\n@\g<1>', filecont)
       
   726 
       
   727     # character encoding, reserved latex characters
       
   728     filecont = re.sub('{\\\&}', '&', filecont)
       
   729     filecont = re.sub('\\\&', '&', filecont)
       
   730 
       
   731     # do checking for open braces to get format correct
       
   732     open_brace_count = 0
       
   733     brace_split = re.split('([{}])',filecont)
       
   734 
       
   735     # rebuild filecont
       
   736     filecont = ''
       
   737 
       
   738     for phrase in brace_split:
       
   739         if phrase == '{':
       
   740             open_brace_count = open_brace_count + 1
       
   741         elif phrase == '}':
       
   742             open_brace_count = open_brace_count - 1
       
   743             if open_brace_count == 0:
       
   744                 filecont = filecont + '\n'
       
   745 
       
   746         filecont = filecont + phrase
       
   747 
       
   748     filecont2 = bibtex_replace_abbreviations(filecont)
       
   749 
       
   750     # gather
       
   751     filecont = filecont2.splitlines()
       
   752     i=0
       
   753     j=0         # count the number of blank lines
       
   754     for line in filecont:
       
   755         # ignore blank lines
       
   756         if line == '' or line == ' ':
       
   757             j = j+1
       
   758             continue
       
   759         filecont[i] = line + '\n'
       
   760         i = i+1
       
   761 
       
   762     # get rid of the extra stuff at the end of the array
       
   763     # (The extra stuff are duplicates that are in the array because
       
   764     # blank lines were removed.)
       
   765     length = len( filecont)
       
   766     filecont[length-j:length] = []
       
   767 
       
   768     return filecont
       
   769 
       
   770 
       
   771 def filehandler(filepath):
       
   772     try:
       
   773         fd = open(filepath, 'r')
       
   774         filecont_source = fd.readlines()
       
   775         fd.close()
       
   776     except:
       
   777         print 'Could not open file:', filepath
       
   778     washeddata = bibtexwasher(filecont_source)
       
   779     outdata = bibtexdecoder(washeddata)
       
   780     print '/**'
       
   781     print '\page references References'
       
   782     print
       
   783     print '<table border="0" cellspacing="5px" width="100%">'
       
   784     print
       
   785     for line in outdata:
       
   786         print line
       
   787     print '</table>'
       
   788     print
       
   789     print '*/'
       
   790 
       
   791 
       
   792 # main program
       
   793 
       
   794 def main():
       
   795     import sys
       
   796     if sys.argv[1:]:
       
   797         filepath = sys.argv[1]
       
   798     else:
       
   799         print "No input file"
       
   800         sys.exit()
       
   801     filehandler(filepath)
       
   802 
       
   803 if __name__ == "__main__": main()
       
   804 
       
   805 
       
   806 # end python script