lemon-1.2: scripts/bib2dox.py@e4554cd6b2bf

     1 #!/usr/bin/env /usr/local/Python/bin/python2.1

     2 """

     3   BibTeX to Doxygen converter

     4   Usage: python bib2dox.py bibfile.bib > bibfile.dox

     6   This code is a modification of the BibTeX to XML converter

     7   by Vidar Bronken Gundersen et al. See the original copyright notices below.

     9   **********************************************************************

    11   Decoder for bibliographic data, BibTeX

    12   Usage: python bibtex2xml.py bibfile.bib > bibfile.xml

    14   v.8

    15   (c)2002-06-23 Vidar Bronken Gundersen

    16   http://bibtexml.sf.net/

    17   Reuse approved as long as this notification is kept.

    18   Licence: GPL.

    20   Contributions/thanks to:

    21   Egon Willighagen, http://sf.net/projects/jreferences/

    22   Richard Mahoney (for providing a test case)

    24   Edited by Sara Sprenkle to be more robust and handle more bibtex features.

    25   (c) 2003-01-15

    27   1.  Changed bibtex: tags to bibxml: tags.

    28   2.  Use xmlns:bibxml="http://bibtexml.sf.net/"

    29   3.  Allow spaces between @type and first {

    30   4.  "author" fields with multiple authors split by " and "

    31       are put in separate xml "bibxml:author" tags.

    32   5.  Option for Titles: words are capitalized

    33       only if first letter in title or capitalized inside braces

    34   6.  Removes braces from within field values

    35   7.  Ignores comments in bibtex file (including @comment{ or % )

    36   8.  Replaces some special latex tags, e.g., replaces ~ with '&#160;'

    37   9.  Handles bibtex @string abbreviations

    38         --> includes bibtex's default abbreviations for months

    39         --> does concatenation of abbr # " more " and " more " # abbr

    40   10. Handles @type( ... ) or @type{ ... }

    41   11. The keywords field is split on , or ; and put into separate xml

    42       "bibxml:keywords" tags

    43   12. Ignores @preamble

    45   Known Limitations

    46   1.  Does not transform Latex encoding like math mode and special

    47       latex symbols.

    48   2.  Does not parse author fields into first and last names.

    49       E.g., It does not do anything special to an author whose name is

    50       in the form LAST_NAME, FIRST_NAME

    51       In "author" tag, will show up as

    52       <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author>

    53   3.  Does not handle "crossref" fields other than to print

    54       <bibxml:crossref>...</bibxml:crossref>

    55   4.  Does not inform user of the input's format errors.  You just won't

    56       be able to transform the file later with XSL

    58   You will have to manually edit the XML output if you need to handle

    59   these (and unknown) limitations.

    61 """

    63 import string, re

    65 # set of valid name characters

    66 valid_name_chars = '[\w\-:]'

    68 #

    69 # define global regular expression variables

    70 #

    71 author_rex = re.compile('\s+and\s+')

    72 rembraces_rex = re.compile('[{}]')

    73 capitalize_rex = re.compile('({\w*})')

    75 # used by bibtexkeywords(data)

    76 keywords_rex = re.compile('[,;]')

    78 # used by concat_line(line)

    79 concatsplit_rex = re.compile('\s*#\s*')

    81 # split on {, }, or " in verify_out_of_braces

    82 delimiter_rex = re.compile('([{}"])',re.I)

    84 field_rex = re.compile('\s*(\w*)\s*=\s*(.*)')

    85 data_rex = re.compile('\s*(\w*)\s*=\s*([^,]*),?')

    87 url_rex = re.compile('\\\url\{([^}]*)\}')

    89 #

    90 # styles for html formatting

    91 #

    92 divstyle = 'margin-top: -4ex; margin-left: 8em;'

    94 #

    95 # return the string parameter with LaTeX \url{...} commands turned into HTML links

    96 #

    97 def transformurls(str):

    98     return url_rex.sub(r'<a href="\1">\1</a>', str)

   100 #

   101 # return the string parameter without braces

   102 #

   103 def removebraces(str):

   104     return rembraces_rex.sub('', str)

   106 #

   107 # latex-specific replacements

   108 # (do this after braces were removed)

   109 #

   110 def latexreplacements(line):

   111     line = string.replace(line, '~', '&nbsp;')

   112     line = string.replace(line, '\\\'a', '&aacute;')

   113     line = string.replace(line, '\\"a', '&auml;')

   114     line = string.replace(line, '\\\'e', '&eacute;')

   115     line = string.replace(line, '\\"e', '&euml;')

   116     line = string.replace(line, '\\\'i', '&iacute;')

   117     line = string.replace(line, '\\"i', '&iuml;')

   118     line = string.replace(line, '\\\'o', '&oacute;')

   119     line = string.replace(line, '\\"o', '&ouml;')

   120     line = string.replace(line, '\\\'u', '&uacute;')

   121     line = string.replace(line, '\\"u', '&uuml;')

   122     line = string.replace(line, '\\H o', '&otilde;')

   123     line = string.replace(line, '\\H u', '&uuml;')   # &utilde; does not exist

   124     line = string.replace(line, '\\\'A', '&Aacute;')

   125     line = string.replace(line, '\\"A', '&Auml;')

   126     line = string.replace(line, '\\\'E', '&Eacute;')

   127     line = string.replace(line, '\\"E', '&Euml;')

   128     line = string.replace(line, '\\\'I', '&Iacute;')

   129     line = string.replace(line, '\\"I', '&Iuml;')

   130     line = string.replace(line, '\\\'O', '&Oacute;')

   131     line = string.replace(line, '\\"O', '&Ouml;')

   132     line = string.replace(line, '\\\'U', '&Uacute;')

   133     line = string.replace(line, '\\"U', '&Uuml;')

   134     line = string.replace(line, '\\H O', '&Otilde;')

   135     line = string.replace(line, '\\H U', '&Uuml;')   # &Utilde; does not exist

   137     return line

   139 #

   140 # copy characters from a string, decoding html expressions (&xyz;)

   141 #

   142 def copychars(str, ifrom, count):

   143     result = ''

   144     i = ifrom

   145     c = 0

   146     html_spec = False

   147     while (i < len(str)) and (c < count):

   148         if str[i] == '&':

   149             html_spec = True;

   150             if i+1 < len(str):

   151                 result += str[i+1]

   152             c += 1

   153             i += 2

   154         else:

   155             if not html_spec:

   156                 if ((str[i] >= 'A') and (str[i] <= 'Z')) or \

   157                    ((str[i] >= 'a') and (str[i] <= 'z')):

   158                     result += str[i]

   159                     c += 1

   160             elif str[i] == ';':

   161                 html_spec = False;

   162             i += 1

   164     return result

   167 #

   168 # Handle a list of authors (separated by 'and').

   169 # It gives back a dictionary with the following values:

   170 #  - num: the number of authors,

   171 #  - list: the list of the author names,

   172 #  - text: the bibtex text (separated by commas and/or 'and')

   173 #  - abbrev: abbreviation that can be used for indicate the

   174 #    bibliography entries

   175 #

   176 def bibtexauthor(data):

   177     result = {}

   178     bibtex = ''

   179     result['list'] = author_rex.split(data)

   180     result['num'] = len(result['list'])

   181     for i, author in enumerate(result['list']):

   182         # general transformations

   183         author = latexreplacements(removebraces(author.strip()))

   184         # transform "Xyz, A. B." to "A. B. Xyz"

   185         pos = author.find(',')

   186         if pos != -1:

   187             author = author[pos+1:].strip() + ' ' + author[:pos].strip()

   188         result['list'][i] = author

   189         bibtex += author + '#'

   190     bibtex = bibtex[:-1]

   191     if result['num'] > 1:

   192         ix = bibtex.rfind('#')

   193         if result['num'] == 2:

   194             bibtex = bibtex[:ix] + ' and ' + bibtex[ix+1:]

   195         else:

   196             bibtex = bibtex[:ix] + ', and ' + bibtex[ix+1:]

   197     bibtex = bibtex.replace('#', ', ')

   198     result['text'] = bibtex

   200     result['abbrev'] = ''

   201     for author in result['list']:

   202         pos = author.rfind(' ') + 1

   203         count = 1

   204         if result['num'] == 1:

   205             count = 3

   206         result['abbrev'] += copychars(author, pos, count)

   208     return result

   211 #

   212 # data = title string

   213 # @return the capitalized title (first letter is capitalized), rest are capitalized

   214 # only if capitalized inside braces

   215 #

   216 def capitalizetitle(data):

   217     title_list = capitalize_rex.split(data)

   218     title = ''

   219     count = 0

   220     for phrase in title_list:

   221          check = string.lstrip(phrase)

   223          # keep phrase's capitalization the same

   224          if check.find('{') == 0:

   225               title += removebraces(phrase)

   226          else:

   227          # first word --> capitalize first letter (after spaces)

   228               if count == 0:

   229                   title += check.capitalize()

   230               else:

   231                   title += phrase.lower()

   232          count = count + 1

   234     return title

   237 #

   238 # @return the bibtex for the title

   239 # @param data --> title string

   240 # braces are removed from title

   241 #

   242 def bibtextitle(data, entrytype):

   243     if entrytype in ('book', 'inbook'):

   244         title = removebraces(data.strip())

   245     else:

   246         title = removebraces(capitalizetitle(data.strip()))

   247     bibtex = title

   248     return bibtex

   251 #

   252 # function to compare entry lists

   253 #

   254 def entry_cmp(x, y):

   255     return cmp(x[0], y[0])

   258 #

   259 # return the Doxygen source lines generated from the washed "filecont_source"

   260 #

   261 def bibtexdecoder(filecont_source):

   262     filecont = []

   263     file = []

   265     # want @<alphanumeric chars><spaces>{<spaces><any chars>,

   266     pubtype_rex = re.compile('@(\w*)\s*{\s*(.*),')

   267     endtype_rex = re.compile('}\s*$')

   268     endtag_rex = re.compile('^\s*}\s*$')

   270     bracefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')

   271     bracedata_rex = re.compile('\s*(\w*)\s*=\s*{(.*)},?')

   273     quotefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')

   274     quotedata_rex = re.compile('\s*(\w*)\s*=\s*"(.*)",?')

   276     for line in filecont_source:

   277         line = line[:-1]

   279         # encode character entities

   280         line = string.replace(line, '&', '&amp;')

   281         line = string.replace(line, '<', '&lt;')

   282         line = string.replace(line, '>', '&gt;')

   284         # start entry: publication type (store for later use)

   285         if pubtype_rex.match(line):

   286         # want @<alphanumeric chars><spaces>{<spaces><any chars>,

   287             entrycont = {}

   288             entry = []

   289             entrytype = pubtype_rex.sub('\g<1>',line)

   290             entrytype = string.lower(entrytype)

   291             entryid   = pubtype_rex.sub('\g<2>', line)

   293         # end entry if just a }

   294         elif endtype_rex.match(line):

   295             # generate doxygen code for the entry

   297             # enty type related formattings

   298             if entrytype in ('book', 'inbook'):

   299                 entrycont['title'] = '<em>' + entrycont['title'] + '</em>'

   300                 if not entrycont.has_key('author'):

   301                     entrycont['author'] = entrycont['editor']

   302                     entrycont['author']['text'] += ', editors'

   303             elif entrytype == 'article':

   304                 entrycont['journal'] = '<em>' + entrycont['journal'] + '</em>'

   305             elif entrytype in ('inproceedings', 'incollection', 'conference'):

   306                 entrycont['booktitle'] = '<em>' + entrycont['booktitle'] + '</em>'

   307             elif entrytype == 'techreport':

   308                 if not entrycont.has_key('type'):

   309                     entrycont['type'] = 'Technical report'

   310             elif entrytype == 'mastersthesis':

   311                 entrycont['type'] = 'Master\'s thesis'

   312             elif entrytype == 'phdthesis':

   313                 entrycont['type'] = 'PhD thesis'

   315             for eline in entrycont:

   316                 if eline != '':

   317                     eline = latexreplacements(eline)

   319             if entrycont.has_key('pages') and (entrycont['pages'] != ''):

   320                 entrycont['pages'] = string.replace(entrycont['pages'], '--', '-')

   322             if entrycont.has_key('author') and (entrycont['author'] != ''):

   323                 entry.append(entrycont['author']['text'] + '.')

   324             if entrycont.has_key('title') and (entrycont['title'] != ''):

   325                 entry.append(entrycont['title'] + '.')

   326             if entrycont.has_key('journal') and (entrycont['journal'] != ''):

   327                 entry.append(entrycont['journal'] + ',')

   328             if entrycont.has_key('booktitle') and (entrycont['booktitle'] != ''):

   329                 entry.append('In ' + entrycont['booktitle'] + ',')

   330             if entrycont.has_key('type') and (entrycont['type'] != ''):

   331                 eline = entrycont['type']

   332                 if entrycont.has_key('number') and (entrycont['number'] != ''):

   333                     eline += ' ' + entrycont['number']

   334                 eline += ','

   335                 entry.append(eline)

   336             if entrycont.has_key('institution') and (entrycont['institution'] != ''):

   337                 entry.append(entrycont['institution'] + ',')

   338             if entrycont.has_key('publisher') and (entrycont['publisher'] != ''):

   339                 entry.append(entrycont['publisher'] + ',')

   340             if entrycont.has_key('school') and (entrycont['school'] != ''):

   341                 entry.append(entrycont['school'] + ',')

   342             if entrycont.has_key('address') and (entrycont['address'] != ''):

   343                 entry.append(entrycont['address'] + ',')

   344             if entrycont.has_key('edition') and (entrycont['edition'] != ''):

   345                 entry.append(entrycont['edition'] + ' edition,')

   346             if entrycont.has_key('howpublished') and (entrycont['howpublished'] != ''):

   347                 entry.append(entrycont['howpublished'] + ',')

   348             if entrycont.has_key('volume') and (entrycont['volume'] != ''):

   349                 eline = entrycont['volume'];

   350                 if entrycont.has_key('number') and (entrycont['number'] != ''):

   351                     eline += '(' + entrycont['number'] + ')'

   352                 if entrycont.has_key('pages') and (entrycont['pages'] != ''):

   353                     eline += ':' + entrycont['pages']

   354                 eline += ','

   355                 entry.append(eline)

   356             else:

   357                 if entrycont.has_key('pages') and (entrycont['pages'] != ''):

   358                     entry.append('pages ' + entrycont['pages'] + ',')

   359             if entrycont.has_key('year') and (entrycont['year'] != ''):

   360                 if entrycont.has_key('month') and (entrycont['month'] != ''):

   361                     entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')

   362                 else:

   363                     entry.append(entrycont['year'] + '.')

   364             if entrycont.has_key('note') and (entrycont['note'] != ''):

   365                 entry.append(entrycont['note'] + '.')

   367             # generate keys for sorting and for the output

   368             sortkey = ''

   369             bibkey = ''

   370             if entrycont.has_key('author'):

   371                 for author in entrycont['author']['list']:

   372                     sortkey += copychars(author, author.rfind(' ')+1, len(author))

   373                 bibkey = entrycont['author']['abbrev']

   374             else:

   375                 bibkey = 'x'

   376             if entrycont.has_key('year'):

   377                 sortkey += entrycont['year']

   378                 bibkey += entrycont['year'][-2:]

   379             if entrycont.has_key('title'):

   380                 sortkey += entrycont['title']

   381             if entrycont.has_key('key'):

   382                 sortkey = entrycont['key'] + sortkey

   383                 bibkey = entrycont['key']

   384             entry.insert(0, sortkey)

   385             entry.insert(1, bibkey)

   386             entry.insert(2, entryid)

   388             # add the entry to the file contents

   389             filecont.append(entry)

   391         else:

   392             # field, publication info

   393             field = ''

   394             data = ''

   396             # field = {data} entries

   397             if bracedata_rex.match(line):

   398                 field = bracefield_rex.sub('\g<1>', line)

   399                 field = string.lower(field)

   400                 data =  bracedata_rex.sub('\g<2>', line)

   402             # field = "data" entries

   403             elif quotedata_rex.match(line):

   404                 field = quotefield_rex.sub('\g<1>', line)

   405                 field = string.lower(field)

   406                 data =  quotedata_rex.sub('\g<2>', line)

   408             # field = data entries

   409             elif data_rex.match(line):

   410                 field = field_rex.sub('\g<1>', line)

   411                 field = string.lower(field)

   412                 data =  data_rex.sub('\g<2>', line)

   414             if field in ('author', 'editor'):

   415                 entrycont[field] = bibtexauthor(data)

   416                 line = ''

   417             elif field == 'title':

   418                 line = bibtextitle(data, entrytype)

   419             elif field != '':

   420                 line = removebraces(transformurls(data.strip()))

   422             if line != '':

   423                 line = latexreplacements(line)

   424                 entrycont[field] = line

   427     # sort entries

   428     filecont.sort(entry_cmp)

   430     # count the bibtex keys

   431     keytable = {}

   432     counttable = {}

   433     for entry in filecont:

   434         bibkey = entry[1]

   435         if not keytable.has_key(bibkey):

   436             keytable[bibkey] = 1

   437         else:

   438             keytable[bibkey] += 1

   440     for bibkey in keytable.keys():

   441         counttable[bibkey] = 0

   443     # generate output

   444     for entry in filecont:

   445         # generate output key form the bibtex key

   446         bibkey = entry[1]

   447         entryid = entry[2]

   448         if keytable[bibkey] == 1:

   449             outkey = bibkey

   450         else:

   451             outkey = bibkey + chr(97 + counttable[bibkey])

   452         counttable[bibkey] += 1

   454         # append the entry code to the output

   455         file.append('\\section ' + entryid + ' [' + outkey + ']')

   456         file.append('<div style="' + divstyle + '">')

   457         for line in entry[3:]:

   458             file.append(line)

   459         file.append('</div>')

   460         file.append('')

   462     return file

   465 #

   466 # return 1 iff abbr is in line but not inside braces or quotes

   467 # assumes that abbr appears only once on the line (out of braces and quotes)

   468 #

   469 def verify_out_of_braces(line, abbr):

   471     phrase_split = delimiter_rex.split(line)

   473     abbr_rex = re.compile( '\\b' + abbr + '\\b', re.I)

   475     open_brace = 0

   476     open_quote = 0

   478     for phrase in phrase_split:

   479         if phrase == "{":

   480             open_brace = open_brace + 1

   481         elif phrase == "}":

   482             open_brace = open_brace - 1

   483         elif phrase == '"':

   484             if open_quote == 1:

   485                 open_quote = 0

   486             else:

   487                 open_quote = 1

   488         elif abbr_rex.search(phrase):

   489             if open_brace == 0 and open_quote == 0:

   490                 return 1

   492     return 0

   495 #

   496 # a line in the form phrase1 # phrase2 # ... # phrasen

   497 # is returned as phrase1 phrase2 ... phrasen

   498 # with the correct punctuation

   499 # Bug: Doesn't always work with multiple abbreviations plugged in

   500 #

def concat_line(line):
    """Join the '#'-concatenated parts of a ``field = a # b # ...`` line.

    The line is taken apart with ``field_rex`` (field name / everything after
    the equals sign) and the value is split on ``#`` with ``concatsplit_rex``.
    The parts are glued back together, separated by single spaces, as
    ``field = part1 part2 ...``.  Delimiters on the inner edges of the parts
    are dropped and a quote delimiting the whole value is turned into a
    brace.

    Known bug (from the original authors): does not always work when
    several abbreviations were substituted into the line.
    """
    # only look at part after equals
    field = field_rex.sub('\g<1>',line)
    rest = field_rex.sub('\g<2>',line)

    # NOTE: this local deliberately-or-not shadows the function name; it
    # only accumulates the rebuilt line.
    concat_line = field + ' ='

    pound_split = concatsplit_rex.split(rest)

    phrase_count = 0
    length = len(pound_split)

    for phrase in pound_split:
        phrase = phrase.strip()
        if phrase_count != 0:
            # not the first part: drop its opening quote or brace
            if phrase.startswith('"') or phrase.startswith('{'):
                phrase = phrase[1:]
        elif phrase.startswith('"'):
            # first part: an opening quote becomes an opening brace
            phrase = phrase.replace('"','{',1)

        if phrase_count != length-1:
            # not the last part: drop its closing quote or brace
            if phrase.endswith('"') or phrase.endswith('}'):
                phrase = phrase[:-1]
        else:
            # last part: a closing quote (optionally followed by a comma)
            # becomes a closing brace
            if phrase.endswith('"'):
                phrase = phrase[:-1]
                phrase = phrase + "}"
            elif phrase.endswith('",'):
                phrase = phrase[:-2]
                phrase = phrase + "},"

        # if phrase did have \#, add the \# back
        # (the split consumed the '#' of an escaped hash, leaving a trailing
        # backslash)
        if phrase.endswith('\\'):
            phrase = phrase + "#"
        concat_line = concat_line + ' ' + phrase

        phrase_count = phrase_count + 1

    return concat_line

   542 #

   543 # substitute abbreviations into filecont

   544 # @param filecont_source - string of data from file

   545 #

def bibtex_replace_abbreviations(filecont_source):
    """Substitute @string abbreviations (and month names) into the text.

    *filecont_source* is a single string; it is processed line by line.
    Lines that define an abbreviation (``@string{name = value``) are
    recorded and dropped, as are ``@comment{`` and ``@preamble{`` lines.
    In the remaining lines every known abbreviation that
    verify_out_of_braces() reports as standing outside of braces and quotes
    is replaced by its value, and ``#`` concatenations are resolved with
    concat_line().

    Returns the processed text as one string with newline-terminated lines.
    """
    filecont = filecont_source.splitlines()

    #  These are defined in bibtex, so we'll define them too
    abbr_list = ['jan','feb','mar','apr','may','jun',
                 'jul','aug','sep','oct','nov','dec']
    value_list = ['January','February','March','April',
                  'May','June','July','August','September',
                  'October','November','December']

    # abbr_rex[k] is the compiled pattern for abbr_list[k]; value_list[k] is
    # its replacement.  The three lists are kept in step.
    abbr_rex = []
    total_abbr_count = 0

    # an abbreviation must be a whole word; an optional trailing comma is
    # captured so that it can be put back after the substituted value
    front = '\\b'
    back = '(,?)\\b'

    # the loop variable is unused; total_abbr_count serves as the index
    for x in abbr_list:
        abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
        total_abbr_count = total_abbr_count + 1


    # group 1: abbreviation name, group 2: everything after the '='
    abbrdef_rex = re.compile('\s*@string\s*{\s*('+ valid_name_chars +'*)\s*=(.*)',
                             re.I)

    comment_rex = re.compile('@comment\s*{',re.I)
    preamble_rex = re.compile('@preamble\s*{',re.I)

    # set to 1 after an @string/@comment/@preamble line; cleared by the next
    # line that contains a '}'
    waiting_for_end_string = 0
    i = 0   # counts the lines kept; not used otherwise
    filecont2 = ''

    for line in filecont:
        if line == ' ' or line == '':
            continue

        if waiting_for_end_string:
            # NOTE(review): only a line containing '}' is dropped here; a
            # line without one falls through and is processed normally while
            # the flag stays set.
            if re.search('}',line):
                waiting_for_end_string = 0
                continue

        if abbrdef_rex.search(line):
            abbr = abbrdef_rex.sub('\g<1>', line)

            # the first definition of a name wins (this also protects the
            # predefined month names)
            if abbr_list.count(abbr) == 0:
                val = abbrdef_rex.sub('\g<2>', line)
                abbr_list.append(abbr)
                value_list.append(string.strip(val))
                abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
                total_abbr_count = total_abbr_count + 1
            waiting_for_end_string = 1
            continue

        if comment_rex.search(line):
            waiting_for_end_string = 1
            continue

        if preamble_rex.search(line):
            waiting_for_end_string = 1
            continue


        # replace subsequent abbreviations with the value
        abbr_count = 0

        for x in abbr_list:

            if abbr_rex[abbr_count].search(line):
                if verify_out_of_braces(line,abbr_list[abbr_count]) == 1:
                    # '\g<1>' restores the optional comma captured by `back`
                    line = abbr_rex[abbr_count].sub( value_list[abbr_count] + '\g<1>', line)
                # Check for # concatenations
                if concatsplit_rex.search(line):
                    line = concat_line(line)
            abbr_count = abbr_count + 1


        filecont2 = filecont2 + line + '\n'
        i = i+1


    # Do one final pass over file

    # make sure that didn't end up with {" or }" after the substitution
    filecont2 = filecont2.replace('{"','{{')
    filecont2 = filecont2.replace('"}','}}')

    afterquotevalue_rex = re.compile('"\s*,\s*')
    afterbrace_rex = re.compile('"\s*}')
    afterbracevalue_rex = re.compile('(=\s*{[^=]*)},\s*')

    # add new lines to data that changed because of abbreviation substitutions
    filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
    filecont2 = afterbrace_rex.sub('"\n}', filecont2)
    filecont2 = afterbracevalue_rex.sub('\g<1>},\n', filecont2)

    return filecont2

   642 #

   643 # convert @type( ... ) to @type{ ... }

   644 #

   645 def no_outer_parens(filecont):

   647     # do checking for open parens

   648     # will convert to braces

   649     paren_split = re.split('([(){}])',filecont)

   651     open_paren_count = 0

   652     open_type = 0

   653     look_next = 0

   655     # rebuild filecont

   656     filecont = ''

   658     at_rex = re.compile('@\w*')

   660     for phrase in paren_split:

   661         if look_next == 1:

   662             if phrase == '(':

   663                 phrase = '{'

   664                 open_paren_count = open_paren_count + 1

   665             else:

   666                 open_type = 0

   667             look_next = 0

   669         if phrase == '(':

   670             open_paren_count = open_paren_count + 1

   672         elif phrase == ')':

   673             open_paren_count = open_paren_count - 1

   674             if open_type == 1 and open_paren_count == 0:

   675                 phrase = '}'

   676                 open_type = 0

   678         elif at_rex.search( phrase ):

   679             open_type = 1

   680             look_next = 1

   682         filecont = filecont + phrase

   684     return filecont

   687 #

   688 # make all whitespace into just one space

   689 # format the bibtex file into a usable form.

   690 #

   691 def bibtexwasher(filecont_source):

   693     space_rex = re.compile('\s+')

   694     comment_rex = re.compile('\s*%')

   696     filecont = []

   698     # remove trailing and excessive whitespace

   699     # ignore comments

   700     for line in filecont_source:

   701         line = string.strip(line)

   702         line = space_rex.sub(' ', line)

   703         # ignore comments

   704         if not comment_rex.match(line) and line != '':

   705             filecont.append(' '+ line)

   707     filecont = string.join(filecont, '')

   709     # the file is in one long string

   711     filecont = no_outer_parens(filecont)

   713     #

   714     # split lines according to preferred syntax scheme

   715     #

   716     filecont = re.sub('(=\s*{[^=]*)},', '\g<1>},\n', filecont)

   718     # add new lines after commas that are after values

   719     filecont = re.sub('"\s*,', '",\n', filecont)

   720     filecont = re.sub('=\s*([\w\d]+)\s*,', '= \g<1>,\n', filecont)

   721     filecont = re.sub('(@\w*)\s*({(\s*)[^,\s]*)\s*,',

   722                           '\n\n\g<1>\g<2>,\n', filecont)

   724     # add new lines after }

   725     filecont = re.sub('"\s*}','"\n}\n', filecont)

   726     filecont = re.sub('}\s*,','},\n', filecont)

   729     filecont = re.sub('@(\w*)', '\n@\g<1>', filecont)

   731     # character encoding, reserved latex characters

   732     filecont = re.sub('{\\\&}', '&', filecont)

   733     filecont = re.sub('\\\&', '&', filecont)

   735     # do checking for open braces to get format correct

   736     open_brace_count = 0

   737     brace_split = re.split('([{}])',filecont)

   739     # rebuild filecont

   740     filecont = ''

   742     for phrase in brace_split:

   743         if phrase == '{':

   744             open_brace_count = open_brace_count + 1

   745         elif phrase == '}':

   746             open_brace_count = open_brace_count - 1

   747             if open_brace_count == 0:

   748                 filecont = filecont + '\n'

   750         filecont = filecont + phrase

   752     filecont2 = bibtex_replace_abbreviations(filecont)

   754     # gather

   755     filecont = filecont2.splitlines()

   756     i=0

   757     j=0         # count the number of blank lines

   758     for line in filecont:

   759         # ignore blank lines

   760         if line == '' or line == ' ':

   761             j = j+1

   762             continue

   763         filecont[i] = line + '\n'

   764         i = i+1

   766     # get rid of the extra stuff at the end of the array

   767     # (The extra stuff are duplicates that are in the array because

   768     # blank lines were removed.)

   769     length = len( filecont)

   770     filecont[length-j:length] = []

   772     return filecont

   775 def filehandler(filepath):

   776     try:

   777         fd = open(filepath, 'r')

   778         filecont_source = fd.readlines()

   779         fd.close()

   780     except:

   781         print 'Could not open file:', filepath

   782     washeddata = bibtexwasher(filecont_source)

   783     outdata = bibtexdecoder(washeddata)

   784     print '/**'

   785     print '\page references References'

   786     print

   787     for line in outdata:

   788         print line

   789     print '*/'

   792 # main program

   794 def main():

   795     import sys

   796     if sys.argv[1:]:

   797         filepath = sys.argv[1]

   798     else:

   799         print "No input file"

   800         sys.exit()

   801     filehandler(filepath)

   803 if __name__ == "__main__": main()

   806 # end python script

author	Balazs Dezso <deba@inf.elte.hu>
	Sun, 04 Oct 2009 00:28:42 +0200
changeset 746	e4554cd6b2bf
parent 743	94ef0a5c0005
child 754	2de0fc630899
permissions	-rw-r--r--