scripts/bib2dox.py
author Peter Kovacs <kpeter@inf.elte.hu>
Sat, 08 Jan 2011 22:51:16 +0100
changeset 1201 9a51db038228
parent 801 2de0fc630899
child 1220 eb2f9d453070
permissions -rwxr-xr-x
Document and greatly improve TSP algorithms (#386)

- Add LEMON headers.
- Add Doxygen doc for all classes and their members.
- Clarify and unify the public API of the algorithms.
- Various small improvements in the implementations to make
them clearer and faster.
- Avoid using adaptors in ChristofidesTsp.
     1 #! /usr/bin/env python
     2 """
     3   BibTeX to Doxygen converter
     4   Usage: python bib2dox.py bibfile.bib > bibfile.dox
     5 
     6   This file is a part of LEMON, a generic C++ optimization library.
     7 
     8   **********************************************************************
     9 
    10   This code is the modification of the BibTeX to XML converter
    11   by Vidar Bronken Gundersen et al.
    12   See the original copyright notices below. 
    13 
    14   **********************************************************************
    15 
    16   Decoder for bibliographic data, BibTeX
    17   Usage: python bibtex2xml.py bibfile.bib > bibfile.xml
    18 
    19   v.8
    20   (c)2002-06-23 Vidar Bronken Gundersen
    21   http://bibtexml.sf.net/
    22   Reuse approved as long as this notification is kept.
    23   Licence: GPL.
    24 
    25   Contributions/thanks to:
    26   Egon Willighagen, http://sf.net/projects/jreferences/
    27   Richard Mahoney (for providing a test case)
    28 
    29   Edited by Sara Sprenkle to be more robust and handle more bibtex features.
    30   (c) 2003-01-15
    31 
    32   1.  Changed bibtex: tags to bibxml: tags.
    33   2.  Use xmlns:bibxml="http://bibtexml.sf.net/"
    34   3.  Allow spaces between @type and first {
    35   4.  "author" fields with multiple authors split by " and "
    36       are put in separate xml "bibxml:author" tags.
    37   5.  Option for Titles: words are capitalized
    38       only if first letter in title or capitalized inside braces
    39   6.  Removes braces from within field values
    40   7.  Ignores comments in bibtex file (including @comment{ or % )
    41   8.  Replaces some special latex tags, e.g., replaces ~ with '&#160;'
    42   9.  Handles bibtex @string abbreviations
    43         --> includes bibtex's default abbreviations for months
    44         --> does concatenation of abbr # " more " and " more " # abbr
    45   10. Handles @type( ... ) or @type{ ... }
    46   11. The keywords field is split on , or ; and put into separate xml
    47       "bibxml:keywords" tags
    48   12. Ignores @preamble
    49 
    50   Known Limitations
    51   1.  Does not transform Latex encoding like math mode and special
    52       latex symbols.
    53   2.  Does not parse author fields into first and last names.
    54       E.g., It does not do anything special to an author whose name is
    55       in the form LAST_NAME, FIRST_NAME
    56       In "author" tag, will show up as
    57       <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author>
    58   3.  Does not handle "crossref" fields other than to print
    59       <bibxml:crossref>...</bibxml:crossref>
    60   4.  Does not inform user of the input's format errors.  You just won't
    61       be able to transform the file later with XSL
    62 
    63   You will have to manually edit the XML output if you need to handle
    64   these (and unknown) limitations.
    65 
    66 """
    67 
    68 import string, re
    69 
    70 # set of valid name characters
    71 valid_name_chars = '[\w\-:]'
    72 
    73 #
    74 # define global regular expression variables
    75 #
    76 author_rex = re.compile('\s+and\s+')
    77 rembraces_rex = re.compile('[{}]')
    78 capitalize_rex = re.compile('({[^}]*})')
    79 
    80 # used by bibtexkeywords(data)
    81 keywords_rex = re.compile('[,;]')
    82 
    83 # used by concat_line(line)
    84 concatsplit_rex = re.compile('\s*#\s*')
    85 
    86 # split on {, }, or " in verify_out_of_braces
    87 delimiter_rex = re.compile('([{}"])',re.I)
    88 
    89 field_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
    90 data_rex = re.compile('\s*(\w*)\s*=\s*([^,]*),?')
    91 
    92 url_rex = re.compile('\\\url\{([^}]*)\}')
    93 
    94 #
    95 # styles for html formatting
    96 #
    97 divstyle = 'margin-top: -4ex; margin-left: 8em;'
    98 
    99 #
   100 # return the string parameter without braces
   101 #
   102 def transformurls(str):
   103     return url_rex.sub(r'<a href="\1">\1</a>', str)
   104 
   105 #
   106 # return the string parameter without braces
   107 #
   108 def removebraces(str):
   109     return rembraces_rex.sub('', str)
   110 
   111 #
   112 # latex-specific replacements
   113 # (do this after braces were removed)
   114 #
   115 def latexreplacements(line):
   116     line = string.replace(line, '~', '&nbsp;')
   117     line = string.replace(line, '\\\'a', '&aacute;')
   118     line = string.replace(line, '\\"a', '&auml;')
   119     line = string.replace(line, '\\\'e', '&eacute;')
   120     line = string.replace(line, '\\"e', '&euml;')
   121     line = string.replace(line, '\\\'i', '&iacute;')
   122     line = string.replace(line, '\\"i', '&iuml;')
   123     line = string.replace(line, '\\\'o', '&oacute;')
   124     line = string.replace(line, '\\"o', '&ouml;')
   125     line = string.replace(line, '\\\'u', '&uacute;')
   126     line = string.replace(line, '\\"u', '&uuml;')
   127     line = string.replace(line, '\\H o', '&otilde;')
   128     line = string.replace(line, '\\H u', '&uuml;')   # &utilde; does not exist
   129     line = string.replace(line, '\\\'A', '&Aacute;')
   130     line = string.replace(line, '\\"A', '&Auml;')
   131     line = string.replace(line, '\\\'E', '&Eacute;')
   132     line = string.replace(line, '\\"E', '&Euml;')
   133     line = string.replace(line, '\\\'I', '&Iacute;')
   134     line = string.replace(line, '\\"I', '&Iuml;')
   135     line = string.replace(line, '\\\'O', '&Oacute;')
   136     line = string.replace(line, '\\"O', '&Ouml;')
   137     line = string.replace(line, '\\\'U', '&Uacute;')
   138     line = string.replace(line, '\\"U', '&Uuml;')
   139     line = string.replace(line, '\\H O', '&Otilde;')
   140     line = string.replace(line, '\\H U', '&Uuml;')   # &Utilde; does not exist
   141 
   142     return line
   143 
   144 #
   145 # copy characters form a string decoding html expressions (&xyz;)
   146 #
   147 def copychars(str, ifrom, count):
   148     result = ''
   149     i = ifrom
   150     c = 0
   151     html_spec = False
   152     while (i < len(str)) and (c < count):
   153         if str[i] == '&':
   154             html_spec = True;
   155             if i+1 < len(str):
   156                 result += str[i+1]
   157             c += 1
   158             i += 2
   159         else:
   160             if not html_spec:
   161                 if ((str[i] >= 'A') and (str[i] <= 'Z')) or \
   162                    ((str[i] >= 'a') and (str[i] <= 'z')):
   163                     result += str[i]
   164                     c += 1
   165             elif str[i] == ';':
   166                 html_spec = False;
   167             i += 1
   168     
   169     return result
   170 
   171 
   172 # 
   173 # Handle a list of authors (separated by 'and').
   174 # It gives back an array of the follwing values:
   175 #  - num: the number of authors,
   176 #  - list: the list of the author names,
   177 #  - text: the bibtex text (separated by commas and/or 'and')
   178 #  - abbrev: abbreviation that can be used for indicate the
   179 #    bibliography entries
   180 #
   181 def bibtexauthor(data):
   182     result = {}
   183     bibtex = ''
   184     result['list'] = author_rex.split(data)
   185     result['num'] = len(result['list'])
   186     for i, author in enumerate(result['list']):
   187         # general transformations
   188         author = latexreplacements(removebraces(author.strip()))
   189         # transform "Xyz, A. B." to "A. B. Xyz"
   190         pos = author.find(',')
   191         if pos != -1:
   192             author = author[pos+1:].strip() + ' ' + author[:pos].strip()
   193         result['list'][i] = author
   194         bibtex += author + '#'
   195     bibtex = bibtex[:-1]
   196     if result['num'] > 1:
   197         ix = bibtex.rfind('#')
   198         if result['num'] == 2:
   199             bibtex = bibtex[:ix] + ' and ' + bibtex[ix+1:]
   200         else:
   201             bibtex = bibtex[:ix] + ', and ' + bibtex[ix+1:]
   202     bibtex = bibtex.replace('#', ', ')
   203     result['text'] = bibtex
   204     
   205     result['abbrev'] = ''
   206     for author in result['list']:
   207         pos = author.rfind(' ') + 1
   208         count = 1
   209         if result['num'] == 1:
   210             count = 3
   211         result['abbrev'] += copychars(author, pos, count)
   212 
   213     return result
   214 
   215 
   216 #
   217 # data = title string
   218 # @return the capitalized title (first letter is capitalized), rest are capitalized
   219 # only if capitalized inside braces
   220 #
   221 def capitalizetitle(data):
   222     title_list = capitalize_rex.split(data)
   223     title = ''
   224     count = 0
   225     for phrase in title_list:
   226          check = string.lstrip(phrase)
   227 
   228          # keep phrase's capitalization the same
   229          if check.find('{') == 0:
   230               title += removebraces(phrase)
   231          else:
   232          # first word --> capitalize first letter (after spaces)
   233               if count == 0:
   234                   title += check.capitalize()
   235               else:
   236                   title += phrase.lower()
   237          count = count + 1
   238 
   239     return title
   240 
   241 
   242 #
   243 # @return the bibtex for the title
   244 # @param data --> title string
   245 # braces are removed from title
   246 #
   247 def bibtextitle(data, entrytype):
   248     if entrytype in ('book', 'inbook'):
   249         title = removebraces(data.strip())
   250     else:
   251         title = removebraces(capitalizetitle(data.strip()))
   252     bibtex = title
   253     return bibtex
   254 
   255 
   256 #
   257 # function to compare entry lists
   258 #
   259 def entry_cmp(x, y):
   260     return cmp(x[0], y[0])
   261 
   262 
   263 #
   264 # print the XML for the transformed "filecont_source"
   265 #
   266 def bibtexdecoder(filecont_source):
   267     filecont = []
   268     file = []
   269     
   270     # want @<alphanumeric chars><spaces>{<spaces><any chars>,
   271     pubtype_rex = re.compile('@(\w*)\s*{\s*(.*),')
   272     endtype_rex = re.compile('}\s*$')
   273     endtag_rex = re.compile('^\s*}\s*$')
   274 
   275     bracefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
   276     bracedata_rex = re.compile('\s*(\w*)\s*=\s*{(.*)},?')
   277 
   278     quotefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
   279     quotedata_rex = re.compile('\s*(\w*)\s*=\s*"(.*)",?')
   280 
   281     for line in filecont_source:
   282         line = line[:-1]
   283 
   284         # encode character entities
   285         line = string.replace(line, '&', '&amp;')
   286         line = string.replace(line, '<', '&lt;')
   287         line = string.replace(line, '>', '&gt;')
   288 
   289         # start entry: publication type (store for later use)
   290         if pubtype_rex.match(line):
   291         # want @<alphanumeric chars><spaces>{<spaces><any chars>,
   292             entrycont = {}
   293             entry = []
   294             entrytype = pubtype_rex.sub('\g<1>',line)
   295             entrytype = string.lower(entrytype)
   296             entryid   = pubtype_rex.sub('\g<2>', line)
   297 
   298         # end entry if just a }
   299         elif endtype_rex.match(line):
   300             # generate doxygen code for the entry
   301 
   302             # enty type related formattings
   303             if entrytype in ('book', 'inbook'):
   304                 entrycont['title'] = '<em>' + entrycont['title'] + '</em>'
   305                 if not entrycont.has_key('author'):
   306                     entrycont['author'] = entrycont['editor']
   307                     entrycont['author']['text'] += ', editors'
   308             elif entrytype == 'article':
   309                 entrycont['journal'] = '<em>' + entrycont['journal'] + '</em>'
   310             elif entrytype in ('inproceedings', 'incollection', 'conference'):
   311                 entrycont['booktitle'] = '<em>' + entrycont['booktitle'] + '</em>'
   312             elif entrytype == 'techreport':
   313                 if not entrycont.has_key('type'):
   314                     entrycont['type'] = 'Technical report'
   315             elif entrytype == 'mastersthesis':
   316                 entrycont['type'] = 'Master\'s thesis'
   317             elif entrytype == 'phdthesis':
   318                 entrycont['type'] = 'PhD thesis'
   319 
   320             for eline in entrycont:
   321                 if eline != '':
   322                     eline = latexreplacements(eline)
   323 
   324             if entrycont.has_key('pages') and (entrycont['pages'] != ''):
   325                 entrycont['pages'] = string.replace(entrycont['pages'], '--', '-')
   326 
   327             if entrycont.has_key('author') and (entrycont['author'] != ''):
   328                 entry.append(entrycont['author']['text'] + '.')
   329             if entrycont.has_key('title') and (entrycont['title'] != ''):
   330                 entry.append(entrycont['title'] + '.')
   331             if entrycont.has_key('journal') and (entrycont['journal'] != ''):
   332                 entry.append(entrycont['journal'] + ',')
   333             if entrycont.has_key('booktitle') and (entrycont['booktitle'] != ''):
   334                 entry.append('In ' + entrycont['booktitle'] + ',')
   335             if entrycont.has_key('type') and (entrycont['type'] != ''):
   336                 eline = entrycont['type']
   337                 if entrycont.has_key('number') and (entrycont['number'] != ''):
   338                     eline += ' ' + entrycont['number']
   339                 eline += ','
   340                 entry.append(eline)
   341             if entrycont.has_key('institution') and (entrycont['institution'] != ''):
   342                 entry.append(entrycont['institution'] + ',')
   343             if entrycont.has_key('publisher') and (entrycont['publisher'] != ''):
   344                 entry.append(entrycont['publisher'] + ',')
   345             if entrycont.has_key('school') and (entrycont['school'] != ''):
   346                 entry.append(entrycont['school'] + ',')
   347             if entrycont.has_key('address') and (entrycont['address'] != ''):
   348                 entry.append(entrycont['address'] + ',')
   349             if entrycont.has_key('edition') and (entrycont['edition'] != ''):
   350                 entry.append(entrycont['edition'] + ' edition,')
   351             if entrycont.has_key('howpublished') and (entrycont['howpublished'] != ''):
   352                 entry.append(entrycont['howpublished'] + ',')
   353             if entrycont.has_key('volume') and (entrycont['volume'] != ''):
   354                 eline = entrycont['volume'];
   355                 if entrycont.has_key('number') and (entrycont['number'] != ''):
   356                     eline += '(' + entrycont['number'] + ')'
   357                 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
   358                     eline += ':' + entrycont['pages']
   359                 eline += ','
   360                 entry.append(eline)
   361             else:
   362                 if entrycont.has_key('pages') and (entrycont['pages'] != ''):
   363                     entry.append('pages ' + entrycont['pages'] + ',')
   364             if entrycont.has_key('year') and (entrycont['year'] != ''):
   365                 if entrycont.has_key('month') and (entrycont['month'] != ''):
   366                     entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')
   367                 else:
   368                     entry.append(entrycont['year'] + '.')
   369             if entrycont.has_key('note') and (entrycont['note'] != ''):
   370                 entry.append(entrycont['note'] + '.')
   371             if entrycont.has_key('url') and (entrycont['url'] != ''):
   372                 entry.append(entrycont['url'] + '.')
   373 
   374             # generate keys for sorting and for the output
   375             sortkey = ''
   376             bibkey = ''
   377             if entrycont.has_key('author'):
   378                 for author in entrycont['author']['list']:
   379                     sortkey += copychars(author, author.rfind(' ')+1, len(author))
   380                 bibkey = entrycont['author']['abbrev']
   381             else:
   382                 bibkey = 'x'
   383             if entrycont.has_key('year'):
   384                 sortkey += entrycont['year']
   385                 bibkey += entrycont['year'][-2:]
   386             if entrycont.has_key('title'):
   387                 sortkey += entrycont['title']
   388             if entrycont.has_key('key'):
   389                 sortkey = entrycont['key'] + sortkey
   390                 bibkey = entrycont['key']
   391             entry.insert(0, sortkey)
   392             entry.insert(1, bibkey)
   393             entry.insert(2, entryid)
   394            
   395             # add the entry to the file contents
   396             filecont.append(entry)
   397 
   398         else:
   399             # field, publication info
   400             field = ''
   401             data = ''
   402             
   403             # field = {data} entries
   404             if bracedata_rex.match(line):
   405                 field = bracefield_rex.sub('\g<1>', line)
   406                 field = string.lower(field)
   407                 data =  bracedata_rex.sub('\g<2>', line)
   408 
   409             # field = "data" entries
   410             elif quotedata_rex.match(line):
   411                 field = quotefield_rex.sub('\g<1>', line)
   412                 field = string.lower(field)
   413                 data =  quotedata_rex.sub('\g<2>', line)
   414 
   415             # field = data entries
   416             elif data_rex.match(line):
   417                 field = field_rex.sub('\g<1>', line)
   418                 field = string.lower(field)
   419                 data =  data_rex.sub('\g<2>', line)
   420 
   421             if field == 'url':
   422                 data = '\\url{' + data.strip() + '}'
   423             
   424             if field in ('author', 'editor'):
   425                 entrycont[field] = bibtexauthor(data)
   426                 line = ''
   427             elif field == 'title':
   428                 line = bibtextitle(data, entrytype)
   429             elif field != '':
   430                 line = removebraces(transformurls(data.strip()))
   431 
   432             if line != '':
   433                 line = latexreplacements(line)
   434                 entrycont[field] = line
   435 
   436 
   437     # sort entries
   438     filecont.sort(entry_cmp)
   439     
   440     # count the bibtex keys
   441     keytable = {}
   442     counttable = {}
   443     for entry in filecont:
   444         bibkey = entry[1]
   445         if not keytable.has_key(bibkey):
   446             keytable[bibkey] = 1
   447         else:
   448             keytable[bibkey] += 1
   449 
   450     for bibkey in keytable.keys():
   451         counttable[bibkey] = 0
   452     
   453     # generate output
   454     for entry in filecont:
   455         # generate output key form the bibtex key
   456         bibkey = entry[1]
   457         entryid = entry[2]
   458         if keytable[bibkey] == 1:
   459             outkey = bibkey
   460         else:
   461             outkey = bibkey + chr(97 + counttable[bibkey])
   462         counttable[bibkey] += 1
   463         
   464         # append the entry code to the output
   465         file.append('\\section ' + entryid + ' [' + outkey + ']')
   466         file.append('<div style="' + divstyle + '">')
   467         for line in entry[3:]:
   468             file.append(line)
   469         file.append('</div>')
   470         file.append('')
   471 
   472     return file
   473 
   474 
   475 #
   476 # return 1 iff abbr is in line but not inside braces or quotes
   477 # assumes that abbr appears only once on the line (out of braces and quotes)
   478 #
   479 def verify_out_of_braces(line, abbr):
   480 
   481     phrase_split = delimiter_rex.split(line)
   482 
   483     abbr_rex = re.compile( '\\b' + abbr + '\\b', re.I)
   484 
   485     open_brace = 0
   486     open_quote = 0
   487 
   488     for phrase in phrase_split:
   489         if phrase == "{":
   490             open_brace = open_brace + 1
   491         elif phrase == "}":
   492             open_brace = open_brace - 1
   493         elif phrase == '"':
   494             if open_quote == 1:
   495                 open_quote = 0
   496             else:
   497                 open_quote = 1
   498         elif abbr_rex.search(phrase):
   499             if open_brace == 0 and open_quote == 0:
   500                 return 1
   501 
   502     return 0
   503 
   504 
   505 #
   506 # a line in the form phrase1 # phrase2 # ... # phrasen
   507 # is returned as phrase1 phrase2 ... phrasen
   508 # with the correct punctuation
   509 # Bug: Doesn't always work with multiple abbreviations plugged in
   510 #
   511 def concat_line(line):
   512     # only look at part after equals
   513     field = field_rex.sub('\g<1>',line)
   514     rest = field_rex.sub('\g<2>',line)
   515 
   516     concat_line = field + ' ='
   517 
   518     pound_split = concatsplit_rex.split(rest)
   519 
   520     phrase_count = 0
   521     length = len(pound_split)
   522 
   523     for phrase in pound_split:
   524         phrase = phrase.strip()
   525         if phrase_count != 0:
   526             if phrase.startswith('"') or phrase.startswith('{'):
   527                 phrase = phrase[1:]
   528         elif phrase.startswith('"'):
   529             phrase = phrase.replace('"','{',1)
   530 
   531         if phrase_count != length-1:
   532             if phrase.endswith('"') or phrase.endswith('}'):
   533                 phrase = phrase[:-1]
   534         else:
   535             if phrase.endswith('"'):
   536                 phrase = phrase[:-1]
   537                 phrase = phrase + "}"
   538             elif phrase.endswith('",'):
   539                 phrase = phrase[:-2]
   540                 phrase = phrase + "},"
   541 
   542         # if phrase did have \#, add the \# back
   543         if phrase.endswith('\\'):
   544             phrase = phrase + "#"
   545         concat_line = concat_line + ' ' + phrase
   546 
   547         phrase_count = phrase_count + 1
   548 
   549     return concat_line
   550 
   551 
   552 #
   553 # substitute abbreviations into filecont
   554 # @param filecont_source - string of data from file
   555 #
   556 def bibtex_replace_abbreviations(filecont_source):
   557     filecont = filecont_source.splitlines()
   558 
   559     #  These are defined in bibtex, so we'll define them too
   560     abbr_list = ['jan','feb','mar','apr','may','jun',
   561                  'jul','aug','sep','oct','nov','dec']
   562     value_list = ['January','February','March','April',
   563                   'May','June','July','August','September',
   564                   'October','November','December']
   565 
   566     abbr_rex = []
   567     total_abbr_count = 0
   568 
   569     front = '\\b'
   570     back = '(,?)\\b'
   571 
   572     for x in abbr_list:
   573         abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
   574         total_abbr_count = total_abbr_count + 1
   575 
   576 
   577     abbrdef_rex = re.compile('\s*@string\s*{\s*('+ valid_name_chars +'*)\s*=(.*)',
   578                              re.I)
   579 
   580     comment_rex = re.compile('@comment\s*{',re.I)
   581     preamble_rex = re.compile('@preamble\s*{',re.I)
   582 
   583     waiting_for_end_string = 0
   584     i = 0
   585     filecont2 = ''
   586 
   587     for line in filecont:
   588         if line == ' ' or line == '':
   589             continue
   590 
   591         if waiting_for_end_string:
   592             if re.search('}',line):
   593                 waiting_for_end_string = 0
   594                 continue
   595 
   596         if abbrdef_rex.search(line):
   597             abbr = abbrdef_rex.sub('\g<1>', line)
   598 
   599             if abbr_list.count(abbr) == 0:
   600                 val = abbrdef_rex.sub('\g<2>', line)
   601                 abbr_list.append(abbr)
   602                 value_list.append(string.strip(val))
   603                 abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
   604                 total_abbr_count = total_abbr_count + 1
   605             waiting_for_end_string = 1
   606             continue
   607 
   608         if comment_rex.search(line):
   609             waiting_for_end_string = 1
   610             continue
   611 
   612         if preamble_rex.search(line):
   613             waiting_for_end_string = 1
   614             continue
   615 
   616 
   617         # replace subsequent abbreviations with the value
   618         abbr_count = 0
   619 
   620         for x in abbr_list:
   621 
   622             if abbr_rex[abbr_count].search(line):
   623                 if verify_out_of_braces(line,abbr_list[abbr_count]) == 1:
   624                     line = abbr_rex[abbr_count].sub( value_list[abbr_count] + '\g<1>', line)
   625                 # Check for # concatenations
   626                 if concatsplit_rex.search(line):
   627                     line = concat_line(line)
   628             abbr_count = abbr_count + 1
   629 
   630 
   631         filecont2 = filecont2 + line + '\n'
   632         i = i+1
   633 
   634 
   635     # Do one final pass over file
   636 
   637     # make sure that didn't end up with {" or }" after the substitution
   638     filecont2 = filecont2.replace('{"','{{')
   639     filecont2 = filecont2.replace('"}','}}')
   640 
   641     afterquotevalue_rex = re.compile('"\s*,\s*')
   642     afterbrace_rex = re.compile('"\s*}')
   643     afterbracevalue_rex = re.compile('(=\s*{[^=]*)},\s*')
   644 
   645     # add new lines to data that changed because of abbreviation substitutions
   646     filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
   647     filecont2 = afterbrace_rex.sub('"\n}', filecont2)
   648     filecont2 = afterbracevalue_rex.sub('\g<1>},\n', filecont2)
   649 
   650     return filecont2
   651 
   652 #
   653 # convert @type( ... ) to @type{ ... }
   654 #
   655 def no_outer_parens(filecont):
   656 
   657     # do checking for open parens
   658     # will convert to braces
   659     paren_split = re.split('([(){}])',filecont)
   660 
   661     open_paren_count = 0
   662     open_type = 0
   663     look_next = 0
   664 
   665     # rebuild filecont
   666     filecont = ''
   667 
   668     at_rex = re.compile('@\w*')
   669 
   670     for phrase in paren_split:
   671         if look_next == 1:
   672             if phrase == '(':
   673                 phrase = '{'
   674                 open_paren_count = open_paren_count + 1
   675             else:
   676                 open_type = 0
   677             look_next = 0
   678 
   679         if phrase == '(':
   680             open_paren_count = open_paren_count + 1
   681 
   682         elif phrase == ')':
   683             open_paren_count = open_paren_count - 1
   684             if open_type == 1 and open_paren_count == 0:
   685                 phrase = '}'
   686                 open_type = 0
   687 
   688         elif at_rex.search( phrase ):
   689             open_type = 1
   690             look_next = 1
   691 
   692         filecont = filecont + phrase
   693 
   694     return filecont
   695 
   696 
   697 #
   698 # make all whitespace into just one space
   699 # format the bibtex file into a usable form.
   700 #
   701 def bibtexwasher(filecont_source):
   702 
   703     space_rex = re.compile('\s+')
   704     comment_rex = re.compile('\s*%')
   705 
   706     filecont = []
   707 
   708     # remove trailing and excessive whitespace
   709     # ignore comments
   710     for line in filecont_source:
   711         line = string.strip(line)
   712         line = space_rex.sub(' ', line)
   713         # ignore comments
   714         if not comment_rex.match(line) and line != '':
   715             filecont.append(' '+ line)
   716 
   717     filecont = string.join(filecont, '')
   718 
   719     # the file is in one long string
   720 
   721     filecont = no_outer_parens(filecont)
   722 
   723     #
   724     # split lines according to preferred syntax scheme
   725     #
   726     filecont = re.sub('(=\s*{[^=]*)},', '\g<1>},\n', filecont)
   727 
   728     # add new lines after commas that are after values
   729     filecont = re.sub('"\s*,', '",\n', filecont)
   730     filecont = re.sub('=\s*([\w\d]+)\s*,', '= \g<1>,\n', filecont)
   731     filecont = re.sub('(@\w*)\s*({(\s*)[^,\s]*)\s*,',
   732                           '\n\n\g<1>\g<2>,\n', filecont)
   733 
   734     # add new lines after }
   735     filecont = re.sub('"\s*}','"\n}\n', filecont)
   736     filecont = re.sub('}\s*,','},\n', filecont)
   737 
   738 
   739     filecont = re.sub('@(\w*)', '\n@\g<1>', filecont)
   740 
   741     # character encoding, reserved latex characters
   742     filecont = re.sub('{\\\&}', '&', filecont)
   743     filecont = re.sub('\\\&', '&', filecont)
   744 
   745     # do checking for open braces to get format correct
   746     open_brace_count = 0
   747     brace_split = re.split('([{}])',filecont)
   748 
   749     # rebuild filecont
   750     filecont = ''
   751 
   752     for phrase in brace_split:
   753         if phrase == '{':
   754             open_brace_count = open_brace_count + 1
   755         elif phrase == '}':
   756             open_brace_count = open_brace_count - 1
   757             if open_brace_count == 0:
   758                 filecont = filecont + '\n'
   759 
   760         filecont = filecont + phrase
   761 
   762     filecont2 = bibtex_replace_abbreviations(filecont)
   763 
   764     # gather
   765     filecont = filecont2.splitlines()
   766     i=0
   767     j=0         # count the number of blank lines
   768     for line in filecont:
   769         # ignore blank lines
   770         if line == '' or line == ' ':
   771             j = j+1
   772             continue
   773         filecont[i] = line + '\n'
   774         i = i+1
   775 
   776     # get rid of the extra stuff at the end of the array
   777     # (The extra stuff are duplicates that are in the array because
   778     # blank lines were removed.)
   779     length = len( filecont)
   780     filecont[length-j:length] = []
   781 
   782     return filecont
   783 
   784 
   785 def filehandler(filepath):
   786     try:
   787         fd = open(filepath, 'r')
   788         filecont_source = fd.readlines()
   789         fd.close()
   790     except:
   791         print 'Could not open file:', filepath
   792     washeddata = bibtexwasher(filecont_source)
   793     outdata = bibtexdecoder(washeddata)
   794     print '/**'
   795     print '\page references References'
   796     print
   797     for line in outdata:
   798         print line
   799     print '*/'
   800 
   801 
   802 # main program
   803 
   804 def main():
   805     import sys
   806     if sys.argv[1:]:
   807         filepath = sys.argv[1]
   808     else:
   809         print "No input file"
   810         sys.exit()
   811     filehandler(filepath)
   812 
   813 if __name__ == "__main__": main()
   814 
   815 
   816 # end python script