#! /usr/bin/env python
"""
  BibTeX to Doxygen converter
  Usage: python bib2dox.py bibfile.bib > bibfile.dox

  This file is a part of LEMON, a generic C++ optimization library.

  **********************************************************************

  This code is a modification of the BibTeX to XML converter
  by Vidar Bronken Gundersen et al.
  See the original copyright notices below.

  **********************************************************************

  Decoder for bibliographic data, BibTeX
  Usage: python bibtex2xml.py bibfile.bib > bibfile.xml

  v.8
  (c)2002-06-23 Vidar Bronken Gundersen
  http://bibtexml.sf.net/
  Reuse approved as long as this notification is kept.
  Licence: GPL.

  Contributions/thanks to:
  Egon Willighagen, http://sf.net/projects/jreferences/
  Richard Mahoney (for providing a test case)

  Edited by Sara Sprenkle to be more robust and to handle more BibTeX features.
  (c) 2003-01-15

  1.  Changed bibtex: tags to bibxml: tags.
  2.  Use xmlns:bibxml="http://bibtexml.sf.net/"
  3.  Allow spaces between @type and first {
  4.  "author" fields with multiple authors split by " and "
      are put in separate xml "bibxml:author" tags.
  5.  Option for Titles: words are capitalized
      only if first letter in title or capitalized inside braces
  6.  Removes braces from within field values
  7.  Ignores comments in bibtex file (including @comment{ or % )
  8.  Replaces some special latex tags, e.g., replaces ~ with '&nbsp;'
  9.  Handles bibtex @string abbreviations
        --> includes bibtex's default abbreviations for months
        --> does concatenation of abbr # " more " and " more " # abbr
  10. Handles @type( ... ) or @type{ ... }
  11. The keywords field is split on , or ; and put into separate xml
      "bibxml:keywords" tags
  12. Ignores @preamble

  Known Limitations
  1.  Does not transform LaTeX encoding like math mode and special
      latex symbols.
  2.  Does not parse author fields into first and last names.
      E.g., it does not do anything special to an author whose name is
      in the form LAST_NAME, FIRST_NAME
      In the "author" tag, the name will show up as
      <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author>
  3.  Does not handle "crossref" fields other than to print
      <bibxml:crossref>...</bibxml:crossref>
  4.  Does not inform the user of the input's format errors.  You just won't
      be able to transform the file later with XSL.

  You will have to manually edit the XML output if you need to handle
  these (and unknown) limitations.

"""

import string, re

# set of valid name characters
valid_name_chars = '[\w\-:]'

#
# define global regular expression variables
#
author_rex = re.compile('\s+and\s+')
rembraces_rex = re.compile('[{}]')
capitalize_rex = re.compile('({[^}]*})')

# used by bibtexkeywords(data)
keywords_rex = re.compile('[,;]')

# used by concat_line(line)
concatsplit_rex = re.compile('\s*#\s*')

# split on {, }, or " in verify_out_of_braces
delimiter_rex = re.compile('([{}"])',re.I)

field_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
data_rex = re.compile('\s*(\w*)\s*=\s*([^,]*),?')

url_rex = re.compile('\\\url\{([^}]*)\}')

#
# styles for html formatting
#
divstyle = 'margin-top: -4ex; margin-left: 8em;'

#
# transform \url{...} fields into HTML links
#
def transformurls(str):
    return url_rex.sub(r'<a href="\1">\1</a>', str)

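# Example (illustrative input): a field value containing
#     \url{http://lemon.cs.elte.hu/}
# is turned by transformurls() into
#     <a href="http://lemon.cs.elte.hu/">http://lemon.cs.elte.hu/</a>
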
#
# return the string parameter without braces
#
def removebraces(str):
    return rembraces_rex.sub('', str)

#
# latex-specific replacements
# (do this after braces were removed)
#
def latexreplacements(line):
    line = string.replace(line, '~', '&nbsp;')
    line = string.replace(line, '\\\'a', '&aacute;')
    line = string.replace(line, '\\"a', '&auml;')
    line = string.replace(line, '\\\'e', '&eacute;')
    line = string.replace(line, '\\"e', '&euml;')
    line = string.replace(line, '\\\'i', '&iacute;')
    line = string.replace(line, '\\"i', '&iuml;')
    line = string.replace(line, '\\\'o', '&oacute;')
    line = string.replace(line, '\\"o', '&ouml;')
    line = string.replace(line, '\\\'u', '&uacute;')
    line = string.replace(line, '\\"u', '&uuml;')
    line = string.replace(line, '\\H o', '&otilde;')
    line = string.replace(line, '\\H u', '&uuml;')  # &utilde; does not exist
    line = string.replace(line, '\\\'A', '&Aacute;')
    line = string.replace(line, '\\"A', '&Auml;')
    line = string.replace(line, '\\\'E', '&Eacute;')
    line = string.replace(line, '\\"E', '&Euml;')
    line = string.replace(line, '\\\'I', '&Iacute;')
    line = string.replace(line, '\\"I', '&Iuml;')
    line = string.replace(line, '\\\'O', '&Oacute;')
    line = string.replace(line, '\\"O', '&Ouml;')
    line = string.replace(line, '\\\'U', '&Uacute;')
    line = string.replace(line, '\\"U', '&Uuml;')
    line = string.replace(line, '\\H O', '&Otilde;')
    line = string.replace(line, '\\H U', '&Uuml;')  # &Utilde; does not exist

    return line

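# Example (illustrative input): once the braces have been stripped,
# latexreplacements() rewrites accented names such as
#     Egerv\'ary  and  K\"onig
# to their HTML entity forms
#     Egerv&aacute;ry  and  K&ouml;nig
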
#
# copy characters from a string, decoding html expressions (&xyz;)
#
def copychars(str, ifrom, count):
    result = ''
    i = ifrom
    c = 0
    html_spec = False
    while (i < len(str)) and (c < count):
        if str[i] == '&':
            html_spec = True
            if i+1 < len(str):
                result += str[i+1]
            c += 1
            i += 2
        else:
            if not html_spec:
                if ((str[i] >= 'A') and (str[i] <= 'Z')) or \
                   ((str[i] >= 'a') and (str[i] <= 'z')):
                    result += str[i]
                    c += 1
            elif str[i] == ';':
                html_spec = False
            i += 1

    return result

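# Examples (illustrative inputs): an HTML entity counts as a single copied
# character; the letter right after the '&' is taken and the rest is skipped
# up to the closing ';':
#     copychars('Dijkstra', 0, 3)    -> 'Dij'
#     copychars('K&ouml;nig', 0, 3)  -> 'Kon'
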
#
# Handle a list of authors (separated by 'and').
# It returns a dictionary with the following keys:
#  - num: the number of authors,
#  - list: the list of the author names,
#  - text: the bibtex text (separated by commas and/or 'and')
#  - abbrev: the abbreviation used to identify the
#    bibliography entries
#
def bibtexauthor(data):
    result = {}
    bibtex = ''
    result['list'] = author_rex.split(data)
    result['num'] = len(result['list'])
    for i, author in enumerate(result['list']):
        # general transformations
        author = latexreplacements(removebraces(author.strip()))
        # transform "Xyz, A. B." to "A. B. Xyz"
        pos = author.find(',')
        if pos != -1:
            author = author[pos+1:].strip() + ' ' + author[:pos].strip()
        result['list'][i] = author
        bibtex += author + '#'
    bibtex = bibtex[:-1]
    if result['num'] > 1:
        ix = bibtex.rfind('#')
        if result['num'] == 2:
            bibtex = bibtex[:ix] + ' and ' + bibtex[ix+1:]
        else:
            bibtex = bibtex[:ix] + ', and ' + bibtex[ix+1:]
    bibtex = bibtex.replace('#', ', ')
    result['text'] = bibtex

    result['abbrev'] = ''
    for author in result['list']:
        pos = author.rfind(' ') + 1
        count = 1
        if result['num'] == 1:
            count = 3
        result['abbrev'] += copychars(author, pos, count)

    return result

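# Example (illustrative input): for the field value
#     'Dijkstra, E. W. and Hoare, C. A. R.'
# bibtexauthor() returns
#     num    = 2
#     list   = ['E. W. Dijkstra', 'C. A. R. Hoare']
#     text   = 'E. W. Dijkstra and C. A. R. Hoare'
#     abbrev = 'DH'
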
#
# @param data --> title string
# @return the title with only its first letter capitalized; the case of the
#         other words is kept only where it is protected by braces
#
def capitalizetitle(data):
    title_list = capitalize_rex.split(data)
    title = ''
    count = 0
    for phrase in title_list:
        check = string.lstrip(phrase)

        # keep phrase's capitalization the same
        if check.find('{') == 0:
            title += removebraces(phrase)
        else:
            # first word --> capitalize first letter (after spaces)
            if count == 0:
                title += check.capitalize()
            else:
                title += phrase.lower()
        count = count + 1

    return title

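# Example (illustrative input): every word except the first one is lower-cased
# unless it is protected by braces, so
#     'Algorithms for {LEMON} Graphs'
# becomes
#     'Algorithms for LEMON graphs'
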
#
# @param data --> title string
# @return the bibtex text for the title; braces are removed
#
def bibtextitle(data, entrytype):
    if entrytype in ('book', 'inbook'):
        title = removebraces(data.strip())
    else:
        title = removebraces(capitalizetitle(data.strip()))
    bibtex = title
    return bibtex

#
# function to compare entry lists
#
def entry_cmp(x, y):
    return cmp(x[0], y[0])

#
# return the Doxygen code for the transformed "filecont_source"
#
def bibtexdecoder(filecont_source):
    filecont = []
    file = []

    # want @<alphanumeric chars><spaces>{<spaces><any chars>,
    pubtype_rex = re.compile('@(\w*)\s*{\s*(.*),')
    endtype_rex = re.compile('}\s*$')
    endtag_rex = re.compile('^\s*}\s*$')

    bracefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
    bracedata_rex = re.compile('\s*(\w*)\s*=\s*{(.*)},?')

    quotefield_rex = re.compile('\s*(\w*)\s*=\s*(.*)')
    quotedata_rex = re.compile('\s*(\w*)\s*=\s*"(.*)",?')

    for line in filecont_source:
        line = line[:-1]

        # encode character entities
        line = string.replace(line, '&', '&amp;')
        line = string.replace(line, '<', '&lt;')
        line = string.replace(line, '>', '&gt;')

        # start entry: publication type (store for later use)
        if pubtype_rex.match(line):
            # want @<alphanumeric chars><spaces>{<spaces><any chars>,
            entrycont = {}
            entry = []
            entrytype = pubtype_rex.sub('\g<1>',line)
            entrytype = string.lower(entrytype)
            entryid = pubtype_rex.sub('\g<2>', line)

        # end entry if just a }
        elif endtype_rex.match(line):
            # generate doxygen code for the entry

            # entry type related formatting
            if entrytype in ('book', 'inbook'):
                entrycont['title'] = '<em>' + entrycont['title'] + '</em>'
                if not entrycont.has_key('author'):
                    entrycont['author'] = entrycont['editor']
                    entrycont['author']['text'] += ', editors'
            elif entrytype == 'article':
                entrycont['journal'] = '<em>' + entrycont['journal'] + '</em>'
            elif entrytype in ('inproceedings', 'incollection', 'conference'):
                entrycont['booktitle'] = '<em>' + entrycont['booktitle'] + '</em>'
            elif entrytype == 'techreport':
                if not entrycont.has_key('type'):
                    entrycont['type'] = 'Technical report'
            elif entrytype == 'mastersthesis':
                entrycont['type'] = 'Master\'s thesis'
            elif entrytype == 'phdthesis':
                entrycont['type'] = 'PhD thesis'

            for eline in entrycont:
                if eline != '':
                    eline = latexreplacements(eline)

            if entrycont.has_key('pages') and (entrycont['pages'] != ''):
                entrycont['pages'] = string.replace(entrycont['pages'], '--', '-')

            if entrycont.has_key('author') and (entrycont['author'] != ''):
                entry.append(entrycont['author']['text'] + '.')
            if entrycont.has_key('title') and (entrycont['title'] != ''):
                entry.append(entrycont['title'] + '.')
            if entrycont.has_key('journal') and (entrycont['journal'] != ''):
                entry.append(entrycont['journal'] + ',')
            if entrycont.has_key('booktitle') and (entrycont['booktitle'] != ''):
                entry.append('In ' + entrycont['booktitle'] + ',')
            if entrycont.has_key('type') and (entrycont['type'] != ''):
                eline = entrycont['type']
                if entrycont.has_key('number') and (entrycont['number'] != ''):
                    eline += ' ' + entrycont['number']
                eline += ','
                entry.append(eline)
            if entrycont.has_key('institution') and (entrycont['institution'] != ''):
                entry.append(entrycont['institution'] + ',')
            if entrycont.has_key('publisher') and (entrycont['publisher'] != ''):
                entry.append(entrycont['publisher'] + ',')
            if entrycont.has_key('school') and (entrycont['school'] != ''):
                entry.append(entrycont['school'] + ',')
            if entrycont.has_key('address') and (entrycont['address'] != ''):
                entry.append(entrycont['address'] + ',')
            if entrycont.has_key('edition') and (entrycont['edition'] != ''):
                entry.append(entrycont['edition'] + ' edition,')
            if entrycont.has_key('howpublished') and (entrycont['howpublished'] != ''):
                entry.append(entrycont['howpublished'] + ',')
            if entrycont.has_key('volume') and (entrycont['volume'] != ''):
                eline = entrycont['volume']
                if entrycont.has_key('number') and (entrycont['number'] != ''):
                    eline += '(' + entrycont['number'] + ')'
                if entrycont.has_key('pages') and (entrycont['pages'] != ''):
                    eline += ':' + entrycont['pages']
                eline += ','
                entry.append(eline)
            else:
                if entrycont.has_key('pages') and (entrycont['pages'] != ''):
                    entry.append('pages ' + entrycont['pages'] + ',')
            if entrycont.has_key('year') and (entrycont['year'] != ''):
                if entrycont.has_key('month') and (entrycont['month'] != ''):
                    entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')
                else:
                    entry.append(entrycont['year'] + '.')
            if entrycont.has_key('note') and (entrycont['note'] != ''):
                entry.append(entrycont['note'] + '.')
            if entrycont.has_key('url') and (entrycont['url'] != ''):
                entry.append(entrycont['url'] + '.')

            # generate keys for sorting and for the output
            sortkey = ''
            bibkey = ''
            if entrycont.has_key('author'):
                for author in entrycont['author']['list']:
                    sortkey += copychars(author, author.rfind(' ')+1, len(author))
                bibkey = entrycont['author']['abbrev']
            else:
                bibkey = 'x'
            if entrycont.has_key('year'):
                sortkey += entrycont['year']
                bibkey += entrycont['year'][-2:]
            if entrycont.has_key('title'):
                sortkey += entrycont['title']
            if entrycont.has_key('key'):
                sortkey = entrycont['key'] + sortkey
                bibkey = entrycont['key']
            entry.insert(0, sortkey)
            entry.insert(1, bibkey)
            entry.insert(2, entryid)

            # add the entry to the file contents
            filecont.append(entry)

        else:
            # field, publication info
            field = ''
            data = ''

            # field = {data} entries
            if bracedata_rex.match(line):
                field = bracefield_rex.sub('\g<1>', line)
                field = string.lower(field)
                data = bracedata_rex.sub('\g<2>', line)

            # field = "data" entries
            elif quotedata_rex.match(line):
                field = quotefield_rex.sub('\g<1>', line)
                field = string.lower(field)
                data = quotedata_rex.sub('\g<2>', line)

            # field = data entries
            elif data_rex.match(line):
                field = field_rex.sub('\g<1>', line)
                field = string.lower(field)
                data = data_rex.sub('\g<2>', line)

            if field == 'url':
                data = '\\url{' + data.strip() + '}'

            if field in ('author', 'editor'):
                entrycont[field] = bibtexauthor(data)
                line = ''
            elif field == 'title':
                line = bibtextitle(data, entrytype)
            elif field != '':
                line = removebraces(transformurls(data.strip()))

            if line != '':
                line = latexreplacements(line)
                entrycont[field] = line

    # sort entries
    filecont.sort(entry_cmp)

    # count the bibtex keys
    keytable = {}
    counttable = {}
    for entry in filecont:
        bibkey = entry[1]
        if not keytable.has_key(bibkey):
            keytable[bibkey] = 1
        else:
            keytable[bibkey] += 1

    for bibkey in keytable.keys():
        counttable[bibkey] = 0

    # generate output
    for entry in filecont:
        # generate the output key from the bibtex key
        bibkey = entry[1]
        entryid = entry[2]
        if keytable[bibkey] == 1:
            outkey = bibkey
        else:
            outkey = bibkey + chr(97 + counttable[bibkey])
            counttable[bibkey] += 1

        # append the entry code to the output
        file.append('\\section ' + entryid + ' [' + outkey + ']')
        file.append('<div style="' + divstyle + '">')
        for line in entry[3:]:
            file.append(line)
        file.append('</div>')
        file.append('')

    return file

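# Sketch of the generated output (illustrative): each entry becomes a block of
# the form
#     \section <bibtex-id> [<output-key>]
#     <div style="margin-top: -4ex; margin-left: 8em;">
#     ... the formatted fields, one per line ...
#     </div>
# where <output-key> is the author/year abbreviation, suffixed with 'a', 'b', ...
# whenever several entries share the same abbreviation.
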
#
# return 1 iff abbr is in line but not inside braces or quotes
# assumes that abbr appears only once on the line (out of braces and quotes)
#
def verify_out_of_braces(line, abbr):

    phrase_split = delimiter_rex.split(line)

    abbr_rex = re.compile( '\\b' + abbr + '\\b', re.I)

    open_brace = 0
    open_quote = 0

    for phrase in phrase_split:
        if phrase == "{":
            open_brace = open_brace + 1
        elif phrase == "}":
            open_brace = open_brace - 1
        elif phrase == '"':
            if open_quote == 1:
                open_quote = 0
            else:
                open_quote = 1
        elif abbr_rex.search(phrase):
            if open_brace == 0 and open_quote == 0:
                return 1

    return 0

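# Examples (illustrative inputs):
#     verify_out_of_braces(' month = jan,', 'jan')               -> 1
#     verify_out_of_braces(' title = "A jan workshop",', 'jan')  -> 0
# In the second line 'jan' only occurs inside the quoted value.
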
#
# a line in the form phrase1 # phrase2 # ... # phraseN
# is returned as phrase1 phrase2 ... phraseN
# with the correct punctuation
# Bug: Doesn't always work with multiple abbreviations plugged in
#
def concat_line(line):
    # only look at the part after the equals sign
    field = field_rex.sub('\g<1>',line)
    rest = field_rex.sub('\g<2>',line)

    concat_line = field + ' ='

    pound_split = concatsplit_rex.split(rest)

    phrase_count = 0
    length = len(pound_split)

    for phrase in pound_split:
        phrase = phrase.strip()
        if phrase_count != 0:
            if phrase.startswith('"') or phrase.startswith('{'):
                phrase = phrase[1:]
        elif phrase.startswith('"'):
            phrase = phrase.replace('"','{',1)

        if phrase_count != length-1:
            if phrase.endswith('"') or phrase.endswith('}'):
                phrase = phrase[:-1]
        else:
            if phrase.endswith('"'):
                phrase = phrase[:-1]
                phrase = phrase + "}"
            elif phrase.endswith('",'):
                phrase = phrase[:-2]
                phrase = phrase + "},"

        # if phrase did have \#, add the \# back
        if phrase.endswith('\\'):
            phrase = phrase + "#"
        concat_line = concat_line + ' ' + phrase

        phrase_count = phrase_count + 1

    return concat_line

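# Example (illustrative input): the '#'-separated pieces of a field are joined
# into a single brace-delimited value, with one space between the pieces:
#     ' howpublished = "http://" # "example.org",'
# becomes
#     'howpublished = {http:// example.org},'
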
#
# substitute abbreviations into filecont
# @param filecont_source - string of data from file
#
def bibtex_replace_abbreviations(filecont_source):
    filecont = filecont_source.splitlines()

    #  These are defined in bibtex, so we'll define them too
    abbr_list = ['jan','feb','mar','apr','may','jun',
                 'jul','aug','sep','oct','nov','dec']
    value_list = ['January','February','March','April',
                  'May','June','July','August','September',
                  'October','November','December']

    abbr_rex = []
    total_abbr_count = 0

    front = '\\b'
    back = '(,?)\\b'

    for x in abbr_list:
        abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
        total_abbr_count = total_abbr_count + 1


    abbrdef_rex = re.compile('\s*@string\s*{\s*('+ valid_name_chars +'*)\s*=(.*)',
                             re.I)

    comment_rex = re.compile('@comment\s*{',re.I)
    preamble_rex = re.compile('@preamble\s*{',re.I)

    waiting_for_end_string = 0
    i = 0
    filecont2 = ''

    for line in filecont:
        if line == ' ' or line == '':
            continue

        if waiting_for_end_string:
            if re.search('}',line):
                waiting_for_end_string = 0
                continue

        if abbrdef_rex.search(line):
            abbr = abbrdef_rex.sub('\g<1>', line)

            if abbr_list.count(abbr) == 0:
                val = abbrdef_rex.sub('\g<2>', line)
                abbr_list.append(abbr)
                value_list.append(string.strip(val))
                abbr_rex.append( re.compile( front + abbr_list[total_abbr_count] + back, re.I ) )
                total_abbr_count = total_abbr_count + 1
            waiting_for_end_string = 1
            continue

        if comment_rex.search(line):
            waiting_for_end_string = 1
            continue

        if preamble_rex.search(line):
            waiting_for_end_string = 1
            continue


        # replace subsequent abbreviations with the value
        abbr_count = 0

        for x in abbr_list:

            if abbr_rex[abbr_count].search(line):
                if verify_out_of_braces(line, abbr_list[abbr_count]) == 1:
                    line = abbr_rex[abbr_count].sub(value_list[abbr_count] + '\g<1>', line)
                # Check for # concatenations
                if concatsplit_rex.search(line):
                    line = concat_line(line)
            abbr_count = abbr_count + 1


        filecont2 = filecont2 + line + '\n'
        i = i+1


    # Do one final pass over file

    # make sure that didn't end up with {" or }" after the substitution
    filecont2 = filecont2.replace('{"','{{')
    filecont2 = filecont2.replace('"}','}}')

    afterquotevalue_rex = re.compile('"\s*,\s*')
    afterbrace_rex = re.compile('"\s*}')
    afterbracevalue_rex = re.compile('(=\s*{[^=]*)},\s*')

    # add new lines to data that changed because of abbreviation substitutions
    filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
    filecont2 = afterbrace_rex.sub('"\n}', filecont2)
    filecont2 = afterbracevalue_rex.sub('\g<1>},\n', filecont2)

    return filecont2

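# Example (illustrative input): with the built-in month abbreviations, a washed
# line such as
#     ' month = jan,'
# comes back as
#     ' month = January,'
# Abbreviations defined with @string{...} are substituted the same way, using
# the (still quoted) value taken from the definition.
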
#
# convert @type( ... ) to @type{ ... }
#
def no_outer_parens(filecont):

    # do checking for open parens
    # will convert to braces
    paren_split = re.split('([(){}])',filecont)

    open_paren_count = 0
    open_type = 0
    look_next = 0

    # rebuild filecont
    filecont = ''

    at_rex = re.compile('@\w*')

    for phrase in paren_split:
        if look_next == 1:
            if phrase == '(':
                phrase = '{'
                open_paren_count = open_paren_count + 1
            else:
                open_type = 0
            look_next = 0

        if phrase == '(':
            open_paren_count = open_paren_count + 1

        elif phrase == ')':
            open_paren_count = open_paren_count - 1
            if open_type == 1 and open_paren_count == 0:
                phrase = '}'
                open_type = 0

        elif at_rex.search( phrase ):
            open_type = 1
            look_next = 1

        filecont = filecont + phrase

    return filecont

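# Example (illustrative input):
#     no_outer_parens('@article(mykey, title = "Some Title")')
# returns
#     '@article{mykey, title = "Some Title"}'
# while parentheses inside field values are left untouched.
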
#
# make all whitespace into just one space
# format the bibtex file into a usable form.
#
def bibtexwasher(filecont_source):

    space_rex = re.compile('\s+')
    comment_rex = re.compile('\s*%')

    filecont = []

    # remove trailing and excessive whitespace
    # ignore comments
    for line in filecont_source:
        line = string.strip(line)
        line = space_rex.sub(' ', line)
        # ignore comments
        if not comment_rex.match(line) and line != '':
            filecont.append(' '+ line)

    filecont = string.join(filecont, '')

    # the file is in one long string

    filecont = no_outer_parens(filecont)

    #
    # split lines according to preferred syntax scheme
    #
    filecont = re.sub('(=\s*{[^=]*)},', '\g<1>},\n', filecont)

    # add new lines after commas that are after values
    filecont = re.sub('"\s*,', '",\n', filecont)
    filecont = re.sub('=\s*([\w\d]+)\s*,', '= \g<1>,\n', filecont)
    filecont = re.sub('(@\w*)\s*({(\s*)[^,\s]*)\s*,',
                      '\n\n\g<1>\g<2>,\n', filecont)

    # add new lines after }
    filecont = re.sub('"\s*}','"\n}\n', filecont)
    filecont = re.sub('}\s*,','},\n', filecont)


    filecont = re.sub('@(\w*)', '\n@\g<1>', filecont)

    # character encoding, reserved latex characters
    filecont = re.sub('{\\\&}', '&', filecont)
    filecont = re.sub('\\\&', '&', filecont)

    # do checking for open braces to get format correct
    open_brace_count = 0
    brace_split = re.split('([{}])',filecont)

    # rebuild filecont
    filecont = ''

    for phrase in brace_split:
        if phrase == '{':
            open_brace_count = open_brace_count + 1
        elif phrase == '}':
            open_brace_count = open_brace_count - 1
            if open_brace_count == 0:
                filecont = filecont + '\n'

        filecont = filecont + phrase

    filecont2 = bibtex_replace_abbreviations(filecont)

    # gather
    filecont = filecont2.splitlines()
    i = 0
    j = 0    # count the number of blank lines
    for line in filecont:
        # ignore blank lines
        if line == '' or line == ' ':
            j = j+1
            continue
        filecont[i] = line + '\n'
        i = i+1

    # get rid of the extra stuff at the end of the array
    # (The extra stuff are duplicates that are in the array because
    # blank lines were removed.)
    length = len(filecont)
    filecont[length-j:length] = []

    return filecont

def filehandler(filepath):
    try:
        fd = open(filepath, 'r')
        filecont_source = fd.readlines()
        fd.close()
    except IOError:
        print 'Could not open file:', filepath
        return
    washeddata = bibtexwasher(filecont_source)
    outdata = bibtexdecoder(washeddata)
    print '/**'
    print '\page references References'
    print
    for line in outdata:
        print line
    print '*/'

# main program

def main():
    import sys
    if sys.argv[1:]:
        filepath = sys.argv[1]
    else:
        print "No input file"
        sys.exit()
    filehandler(filepath)

if __name__ == "__main__": main()


# end python script
|