1 | | #! /usr/bin/env python |
2 | | """ |
3 | | BibTeX to Doxygen converter |
4 | | Usage: python bib2dox.py bibfile.bib > bibfile.dox |
5 | | |
6 | | This file is a part of LEMON, a generic C++ optimization library. |
7 | | |
8 | | ********************************************************************** |
9 | | |
10 | | This code is the modification of the BibTeX to XML converter |
11 | | by Vidar Bronken Gundersen et al. |
12 | | See the original copyright notices below. |
13 | | |
14 | | ********************************************************************** |
15 | | |
16 | | Decoder for bibliographic data, BibTeX |
17 | | Usage: python bibtex2xml.py bibfile.bib > bibfile.xml |
18 | | |
19 | | v.8 |
20 | | (c)2002-06-23 Vidar Bronken Gundersen |
21 | | http://bibtexml.sf.net/ |
22 | | Reuse approved as long as this notification is kept. |
23 | | Licence: GPL. |
24 | | |
25 | | Contributions/thanks to: |
26 | | Egon Willighagen, http://sf.net/projects/jreferences/ |
27 | | Richard Mahoney (for providing a test case) |
28 | | |
Edited by Sara Sprenkle to be more robust and handle more bibtex features.
30 | | (c) 2003-01-15 |
31 | | |
32 | | 1. Changed bibtex: tags to bibxml: tags. |
33 | | 2. Use xmlns:bibxml="http://bibtexml.sf.net/" |
34 | | 3. Allow spaces between @type and first { |
35 | | 4. "author" fields with multiple authors split by " and " |
36 | | are put in separate xml "bibxml:author" tags. |
37 | | 5. Option for Titles: words are capitalized |
38 | | only if first letter in title or capitalized inside braces |
39 | | 6. Removes braces from within field values |
40 | | 7. Ignores comments in bibtex file (including @comment{ or % ) |
41 | | 8. Replaces some special latex tags, e.g., replaces ~ with ' ' |
42 | | 9. Handles bibtex @string abbreviations |
43 | | --> includes bibtex's default abbreviations for months |
44 | | --> does concatenation of abbr # " more " and " more " # abbr |
45 | | 10. Handles @type( ... ) or @type{ ... } |
46 | | 11. The keywords field is split on , or ; and put into separate xml |
47 | | "bibxml:keywords" tags |
48 | | 12. Ignores @preamble |
49 | | |
50 | | Known Limitations |
51 | | 1. Does not transform Latex encoding like math mode and special |
52 | | latex symbols. |
53 | | 2. Does not parse author fields into first and last names. |
54 | | E.g., It does not do anything special to an author whose name is |
55 | | in the form LAST_NAME, FIRST_NAME |
56 | | In "author" tag, will show up as |
57 | | <bibxml:author>LAST_NAME, FIRST_NAME</bibxml:author> |
58 | | 3. Does not handle "crossref" fields other than to print |
59 | | <bibxml:crossref>...</bibxml:crossref> |
60 | | 4. Does not inform user of the input's format errors. You just won't |
61 | | be able to transform the file later with XSL |
62 | | |
63 | | You will have to manually edit the XML output if you need to handle |
64 | | these (and unknown) limitations. |
65 | | |
66 | | """ |
67 | | |
68 | | import string, re |
69 | | |
# Regex character class of the characters allowed in a @string
# abbreviation name.
valid_name_chars = r'[\w\-:]'

#
# define global regular expression variables
#
# All patterns are raw strings: the previous plain literals relied on
# unrecognised escapes such as '\s' being passed through, and '\\\url'
# does not even compile on Python 3 because '\u' starts a unicode escape.
#
author_rex = re.compile(r'\s+and\s+')
rembraces_rex = re.compile(r'[{}]')
capitalize_rex = re.compile(r'({[^}]*})')

# splits a keyword list on ',' or ';'
# (no user of this pattern is visible in this part of the file)
keywords_rex = re.compile(r'[,;]')

# used by concat_line(line)
concatsplit_rex = re.compile(r'\s*#\s*')

# split on {, }, or " in verify_out_of_braces
delimiter_rex = re.compile(r'([{}"])', re.I)

# "name = rest-of-line" and "name = bare-value[,]" field lines
field_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
data_rex = re.compile(r'\s*(\w*)\s*=\s*([^,]*),?')

# a literal backslash followed by url{...}; group 1 is the address
url_rex = re.compile(r'\\url\{([^}]*)\}')

#
# styles for html formatting
#
divstyle = 'margin-top: -4ex; margin-left: 10em;'
98 | | |
99 | | # |
# return the string parameter with every \url{...} command turned into an HTML link
101 | | # |
def transformurls(str):
    """Return *str* with every ``\\url{address}`` rewritten as an HTML anchor.

    The address is used both as the link target and as the link text.
    """
    anchor_template = r'<a href="\1">\1</a>'
    return url_rex.sub(anchor_template, str)
104 | | |
105 | | # |
106 | | # return the string parameter without braces |
107 | | # |
def removebraces(str):
    """Return *str* with every ``{`` and ``}`` character deleted."""
    without_braces = rembraces_rex.sub('', str)
    return without_braces
110 | | |
111 | | # |
112 | | # latex-specific replacements |
113 | | # (do this after braces were removed) |
114 | | # |
# Ordered (LaTeX sequence, HTML entity name) pairs used by
# latexreplacements().  Only entity *names* are stored; the function adds
# the leading '&' and trailing ';'.  The double-acute accent (\H) is mapped
# to the tilde / diaeresis forms, exactly as the original table did.
_LATEX_ENTITIES = (
    ('~', 'nbsp'),
    ("\\'a", 'aacute'),
    ('\\"a', 'auml'),
    ("\\'e", 'eacute'),
    ('\\"e', 'euml'),
    ("\\'i", 'iacute'),
    ('\\"i', 'iuml'),
    ("\\'o", 'oacute'),
    ('\\"o', 'ouml'),
    ("\\'u", 'uacute'),
    ('\\"u', 'uuml'),
    ('\\H o', 'otilde'),
    ('\\H u', 'uuml'),    # there is no "utilde" entity
    ("\\'A", 'Aacute'),
    ('\\"A', 'Auml'),
    ("\\'E", 'Eacute'),
    ('\\"E', 'Euml'),
    ("\\'I", 'Iacute'),
    ('\\"I', 'Iuml'),
    ("\\'O", 'Oacute'),
    ('\\"O', 'Ouml'),
    ("\\'U", 'Uacute'),
    ('\\"U', 'Uuml'),
    ('\\H O', 'Otilde'),
    ('\\H U', 'Uuml'),    # there is no "Utilde" entity
)


def latexreplacements(line):
    """Replace LaTeX ties and accent commands in *line* by HTML entities.

    Call this after braces have been removed, so that an accent written as
    ``{\\'a}`` has already been reduced to ``\\'a``.

    The result is pure ASCII: each accented letter is emitted as a named
    HTML entity (an ampersand, the entity name and a semicolon), which is
    the form copychars() knows how to skip over.  Emitting the raw accented
    characters instead would require a source-encoding declaration on
    Python 2 and would make copychars() drop those letters.

    @param line  the text to convert
    @return      the converted text; text without any of the known
                 sequences is returned unchanged
    """
    for latex, entity in _LATEX_ENTITIES:
        line = line.replace(latex, '&' + entity + ';')
    return line
143 | | |
144 | | # |
# copy characters from a string decoding html expressions (&xyz;)
146 | | # |
def copychars(str, ifrom, count):
    """Collect up to *count* letters of *str*, starting at index *ifrom*.

    Plain ASCII letters are copied as they are; every other character is
    skipped.  An HTML entity (``&`` ... ``;``) contributes exactly one
    character: the one that directly follows the ampersand.  The rest of
    the entity, up to and including the semicolon, is ignored.

    @param str     the text to read from
    @param ifrom   index of the first character to look at
    @param count   maximum number of characters to collect
    @return        the collected characters as a string
    """
    collected = []
    end = len(str)
    pos = ifrom
    inside_entity = False

    while pos < end and len(collected) < count:
        ch = str[pos]

        if ch == '&':
            # entity start: keep the character after '&', then skip both
            inside_entity = True
            if pos + 1 < end:
                collected.append(str[pos + 1])
            pos += 2
            continue

        if inside_entity:
            # swallow the entity body; ';' closes it
            if ch == ';':
                inside_entity = False
        elif 'A' <= ch <= 'Z' or 'a' <= ch <= 'z':
            collected.append(ch)
        pos += 1

    return ''.join(collected)
170 | | |
171 | | |
172 | | # |
173 | | # Handle a list of authors (separated by 'and'). |
# It gives back a dictionary with the following values:
175 | | # - num: the number of authors, |
176 | | # - list: the list of the author names, |
177 | | # - text: the bibtex text (separated by commas and/or 'and') |
178 | | # - abbrev: abbreviation that can be used for indicate the |
179 | | # bibliography entries |
180 | | # |
def bibtexauthor(data):
    """Parse a BibTeX author/editor field into a small record.

    The field is split on the word ``and``.  Each name is stripped, has its
    braces removed and its LaTeX accents replaced, and a name written as
    "Last, First" is turned into "First Last".

    The display text is assembled directly from the name list.  The earlier
    implementation glued the names together with '#' and later replaced that
    character, which corrupted any name that itself contained a '#'.

    @param data  the raw field value
    @return      a dictionary with the keys
                 'num'    - number of names,
                 'list'   - the normalised names,
                 'text'   - the names joined as "A", "A and B" or
                            "A, B, and C",
                 'abbrev' - letters taken from the last word of each name
                            (three for a single name, one per name
                            otherwise), used to build the citation label
    """
    names = []
    for raw_name in author_rex.split(data):
        # general transformations
        name = latexreplacements(removebraces(raw_name.strip()))
        # transform "Xyz, A. B." to "A. B. Xyz"
        last, comma, first = name.partition(',')
        if comma:
            name = first.strip() + ' ' + last.strip()
        names.append(name)

    num = len(names)
    if num == 1:
        text = names[0]
    elif num == 2:
        text = names[0] + ' and ' + names[1]
    else:
        text = ', '.join(names[:-1]) + ', and ' + names[-1]

    if num == 1:
        letters_per_name = 3
    else:
        letters_per_name = 1
    abbrev = ''.join([copychars(name, name.rfind(' ') + 1, letters_per_name)
                      for name in names])

    return {'list': names, 'num': num, 'text': text, 'abbrev': abbrev}
214 | | |
215 | | |
216 | | # |
217 | | # data = title string |
# @return the title with its first letter capitalized and the rest lower-cased;
# text inside braces keeps the capitalization it was given
220 | | # |
def capitalizetitle(data):
    """Apply BibTeX-style title capitalisation to *data*.

    The title is cut into brace groups and the text between them:
      - a brace group keeps its capitalisation and loses its braces,
      - the very first piece has leading blanks removed and is passed
        through str.capitalize(),
      - every other piece is lower-cased.

    Uses string methods instead of the ``string`` module functions, which
    no longer exist on Python 3.

    @param data  the title string
    @return      the re-capitalised title
    """
    pieces = []
    for index, phrase in enumerate(capitalize_rex.split(data)):
        stripped = phrase.lstrip()
        if stripped.startswith('{'):
            # keep phrase's capitalization the same
            pieces.append(removebraces(phrase))
        elif index == 0:
            # first word --> capitalize first letter (after spaces)
            pieces.append(stripped.capitalize())
        else:
            pieces.append(phrase.lower())
    return ''.join(pieces)
240 | | |
241 | | |
242 | | # |
243 | | # @return the bibtex for the title |
244 | | # @param data --> title string |
245 | | # braces are removed from title |
246 | | # |
def bibtextitle(data, entrytype):
    """Return the cleaned-up title for an entry of type *entrytype*.

    Titles of 'book' and 'inbook' entries keep their capitalisation; all
    other titles go through capitalizetitle() first.  In both cases the
    surrounding blanks and all braces are removed.

    @param data       the raw title string
    @param entrytype  the lower-case BibTeX entry type
    @return           the title text
    """
    text = data.strip()
    if entrytype not in ('book', 'inbook'):
        text = capitalizetitle(text)
    return removebraces(text)
254 | | |
255 | | |
256 | | # |
257 | | # function to compare entry lists |
258 | | # |
def entry_cmp(x, y):
    """Three-way comparison of two entry lists by their first element.

    Written without the cmp() builtin, which does not exist on Python 3.

    @param x, y  sequences whose element 0 is the sort key
    @return      -1, 0 or 1 when x sorts before, equal to or after y
    """
    return (x[0] > y[0]) - (x[0] < y[0])
261 | | |
262 | | |
263 | | # |
# return the Doxygen lines generated from the washed BibTeX lines in "filecont_source"
265 | | # |
# want @<alphanumeric chars><spaces>{<spaces><any chars>,
_pubtype_rex = re.compile(r'@(\w*)\s*{\s*(.*),')
# a line that starts with the closing brace of an entry
_endtype_rex = re.compile(r'}\s*$')
# field = {data} and field = "data" lines
_bracedata_rex = re.compile(r'\s*(\w*)\s*=\s*{(.*)},?')
_quotedata_rex = re.compile(r'\s*(\w*)\s*=\s*"(.*)",?')


def _parse_field(line):
    """Split a washed field line into (lower-case field name, raw data).

    Tries the braced, the quoted and finally the bare value form.  Returns
    ('', '') when the line is not a field assignment.
    """
    for data_pattern in (_bracedata_rex, _quotedata_rex, data_rex):
        if data_pattern.match(line):
            # sub() instead of group(): text the pattern leaves unmatched
            # stays part of the result, as it always has.
            field = field_rex.sub(r'\g<1>', line).lower()
            data = data_pattern.sub(r'\g<2>', line)
            return field, data
    return '', ''


def _format_entry(entrytype, entrycont):
    """Return the list of text fragments that make up one reference.

    *entrycont* is adjusted in place (emphasis markup, default 'type',
    page-range dashes) because the sort key is derived from it afterwards.
    Fields that are absent are simply left out of the result.
    """
    def has(name):
        return name in entrycont and entrycont[name] != ''

    def emphasize(name):
        if name in entrycont:
            entrycont[name] = '<em>' + entrycont[name] + '</em>'

    # entry type related formattings
    if entrytype in ('book', 'inbook'):
        emphasize('title')
        if 'author' not in entrycont and 'editor' in entrycont:
            # work on a copy so that the editor record itself is untouched
            editors = dict(entrycont['editor'])
            if editors['num'] == 1:
                editors['text'] += ', editor'
            else:
                editors['text'] += ', editors'
            entrycont['author'] = editors
    elif entrytype == 'article':
        emphasize('journal')
    elif entrytype in ('inproceedings', 'incollection', 'conference'):
        emphasize('booktitle')
    elif entrytype == 'techreport':
        if 'type' not in entrycont:
            entrycont['type'] = 'Technical report'
    elif entrytype == 'mastersthesis':
        entrycont['type'] = 'Master\'s thesis'
    elif entrytype == 'phdthesis':
        entrycont['type'] = 'PhD thesis'

    if has('pages'):
        entrycont['pages'] = entrycont['pages'].replace('--', '-')

    entry = []
    if has('author'):
        entry.append(entrycont['author']['text'] + '.')
    if has('title'):
        entry.append(entrycont['title'] + '.')
    if has('journal'):
        entry.append(entrycont['journal'] + ',')
    if has('booktitle'):
        entry.append('In ' + entrycont['booktitle'] + ',')
    if has('type'):
        eline = entrycont['type']
        if has('number'):
            eline += ' ' + entrycont['number']
        entry.append(eline + ',')
    for name in ('institution', 'publisher', 'school', 'address'):
        if has(name):
            entry.append(entrycont[name] + ',')
    if has('edition'):
        entry.append(entrycont['edition'] + ' edition,')
    if has('howpublished'):
        entry.append(entrycont['howpublished'] + ',')
    if has('volume'):
        # volume(number):pages
        eline = entrycont['volume']
        if has('number'):
            eline += '(' + entrycont['number'] + ')'
        if has('pages'):
            eline += ':' + entrycont['pages']
        entry.append(eline + ',')
    elif has('pages'):
        entry.append('pages ' + entrycont['pages'] + ',')
    if has('year'):
        if has('month'):
            entry.append(entrycont['month'] + ' ' + entrycont['year'] + '.')
        else:
            entry.append(entrycont['year'] + '.')
    if has('note'):
        entry.append(entrycont['note'] + '.')
    if has('url'):
        entry.append(entrycont['url'] + '.')
    return entry


def _entry_keys(entrycont):
    """Return (sortkey, bibkey) for an entry.

    The sort key is the letters of every author's last word, followed by
    the year and the title.  The label is the author abbreviation (or 'x'
    without authors) plus the last two characters of the year.  An
    explicit 'key' field is put in front of the sort key and replaces the
    label.
    """
    sortkey = ''
    if 'author' in entrycont:
        for author in entrycont['author']['list']:
            sortkey += copychars(author, author.rfind(' ') + 1, len(author))
        bibkey = entrycont['author']['abbrev']
    else:
        bibkey = 'x'
    if 'year' in entrycont:
        sortkey += entrycont['year']
        bibkey += entrycont['year'][-2:]
    if 'title' in entrycont:
        sortkey += entrycont['title']
    if 'key' in entrycont:
        sortkey = entrycont['key'] + sortkey
        bibkey = entrycont['key']
    return sortkey, bibkey


def bibtexdecoder(filecont_source):
    """Convert washed BibTeX lines into the lines of a Doxygen page body.

    Each input line is expected to hold one entry header ("@type{id,"),
    one field assignment, or the closing brace of an entry, and to end
    with a newline, as produced by bibtexwasher().

    Differences from the earlier implementation:
      - '&', '<' and '>' are really encoded as HTML entities (the old
        replacements substituted each character by itself),
      - a missing title/journal/booktitle/editor no longer raises
        KeyError, and text outside of any entry no longer raises
        NameError; such input is skipped,
      - a line without a trailing newline no longer loses its last
        character,
      - a single editor is labelled "editor" rather than "editors",
      - only constructs available on both Python 2 and Python 3 are used.

    @param filecont_source  iterable of washed BibTeX lines
    @return  list of strings: for every entry, sorted by its sort key, a
             "\\section <id> [<label>]" line, an opening <div>, the
             reference fragments, the closing </div> and an empty line.
             Entries that share a label get the suffixes a, b, c, ...
    """
    filecont = []
    entrycont = None      # fields of the entry being read, None outside
    entrytype = ''
    entryid = ''

    for line in filecont_source:
        if line.endswith('\n'):
            line = line[:-1]

        # encode character entities ('&' first, so that the ampersands
        # introduced for '<' and '>' are not encoded again)
        for char, entity in (('&', 'amp'), ('<', 'lt'), ('>', 'gt')):
            line = line.replace(char, '&' + entity + ';')

        # start entry: publication type (store for later use)
        if _pubtype_rex.match(line):
            entrycont = {}
            entrytype = _pubtype_rex.sub(r'\g<1>', line).lower()
            entryid = _pubtype_rex.sub(r'\g<2>', line)
            continue

        # end entry if just a }
        if _endtype_rex.match(line):
            if entrycont is None:
                # closing brace that does not belong to an open entry
                continue
            entry = _format_entry(entrytype, entrycont)
            sortkey, bibkey = _entry_keys(entrycont)
            filecont.append([sortkey, bibkey, entryid] + entry)
            entrycont = None
            continue

        # field, publication info
        if entrycont is None:
            # text between entries is not part of the bibliography
            continue
        field, data = _parse_field(line)

        if field == 'url':
            data = '\\url{' + data.strip() + '}'

        if field in ('author', 'editor'):
            entrycont[field] = bibtexauthor(data)
            continue
        if field == 'title':
            value = bibtextitle(data, entrytype)
        elif field != '':
            value = removebraces(transformurls(data.strip()))
        else:
            continue
        # empty values are not stored at all
        if value != '':
            entrycont[field] = latexreplacements(value)

    # sort entries (stable, by sort key only)
    filecont.sort(key=lambda item: item[0])

    # count the bibtex keys
    keytable = {}
    for entry in filecont:
        keytable[entry[1]] = keytable.get(entry[1], 0) + 1

    # generate output
    counttable = {}
    output = []
    for entry in filecont:
        # generate output key from the bibtex key
        bibkey = entry[1]
        entryid = entry[2]
        if keytable[bibkey] == 1:
            outkey = bibkey
        else:
            used = counttable.get(bibkey, 0)
            outkey = bibkey + chr(97 + used)
            counttable[bibkey] = used + 1

        # append the entry code to the output
        output.append('\\section ' + entryid + ' [' + outkey + ']')
        output.append('<div style="' + divstyle + '">')
        output.extend(entry[3:])
        output.append('</div>')
        output.append('')

    return output
473 | | |
474 | | |
475 | | # |
476 | | # return 1 iff abbr is in line but not inside braces or quotes |
477 | | # assumes that abbr appears only once on the line (out of braces and quotes) |
478 | | # |
def verify_out_of_braces(line, abbr):
    """Tell whether *abbr* occurs in *line* outside of braces and quotes.

    The line is cut at every '{', '}' and '"'.  While walking over the
    pieces the brace nesting depth and the open/closed state of the
    double quotes are tracked; the first piece that contains *abbr* as a
    whole word (case-insensitively) while both are closed gives a hit.

    @param line  the line to inspect
    @param abbr  the abbreviation name, inserted into a regular
                 expression as it is
    @return      1 on a hit, 0 otherwise
    """
    word_rex = re.compile('\\b' + abbr + '\\b', re.I)

    depth = 0
    in_quotes = False

    for token in delimiter_rex.split(line):
        if token == "{":
            depth += 1
        elif token == "}":
            depth -= 1
        elif token == '"':
            in_quotes = not in_quotes
        elif depth == 0 and not in_quotes and word_rex.search(token):
            return 1

    return 0
503 | | |
504 | | |
505 | | # |
506 | | # a line in the form phrase1 # phrase2 # ... # phrasen |
507 | | # is returned as phrase1 phrase2 ... phrasen |
508 | | # with the correct punctuation |
509 | | # Bug: Doesn't always work with multiple abbreviations plugged in |
510 | | # |
def concat_line(line):
    """Merge the '#'-concatenated parts of a field line into one value.

    A line of the form  field = part1 # part2 # ... # partn  is returned
    as  field = part1 part2 ... partn .  The delimiters between the parts
    are dropped, an opening double quote of the first part becomes '{'
    and a closing double quote of the last part becomes '}'.

    Known limitation: does not always work when several abbreviations
    were substituted into the same line.

    @param line  the field line
    @return      the rebuilt line
    """
    # only look at part after equals
    name = field_rex.sub(r'\g<1>', line)
    value = field_rex.sub(r'\g<2>', line)

    parts = concatsplit_rex.split(value)
    last_index = len(parts) - 1
    rebuilt = name + ' ='

    for index, part in enumerate(parts):
        part = part.strip()

        # opening delimiter: turned into '{' on the first part, dropped
        # on all the others
        if index == 0:
            if part.startswith('"'):
                part = part.replace('"', '{', 1)
        elif part.startswith(('"', '{')):
            part = part[1:]

        # closing delimiter: dropped everywhere but on the last part,
        # where a double quote becomes '}'
        if index != last_index:
            if part.endswith(('"', '}')):
                part = part[:-1]
        elif part.endswith('"'):
            part = part[:-1] + "}"
        elif part.endswith('",'):
            part = part[:-2] + "},"

        # if phrase did have \#, add the \# back
        if part.endswith('\\'):
            part += "#"

        rebuilt = rebuilt + ' ' + part

    return rebuilt
550 | | |
551 | | |
552 | | # |
553 | | # substitute abbreviations into filecont |
554 | | # @param filecont_source - string of data from file |
555 | | # |
def bibtex_replace_abbreviations(filecont_source):
    """Substitute @string abbreviations into the BibTeX text.

    Works line by line on the pre-split text:
      - a @string definition is recorded (the first definition of a name
        wins) and removed from the text,
      - @comment and @preamble lines are removed,
      - after any of those, the next line containing '}' is removed too,
        because the washer puts the closing brace on a line of its own,
      - in every other line each known abbreviation that occurs outside
        of braces and quotes is replaced by its value, and '#'
        concatenations on such a line are resolved with concat_line().

    The three-letter month names are predefined, as in BibTeX.

    The value is inserted through a replacement function.  It used to be
    passed to sub() as a replacement template, so a backslash inside a
    value (common in LaTeX text) was interpreted as a regex escape.

    @param filecont_source  string of data from file
    @return  the text with abbreviations expanded, one line per field
    """
    # These are defined in bibtex, so we'll define them too
    abbr_list = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    value_list = ['January', 'February', 'March', 'April',
                  'May', 'June', 'July', 'August', 'September',
                  'October', 'November', 'December']

    front = r'\b'
    back = r'(,?)\b'
    abbr_rex = [re.compile(front + abbr + back, re.I) for abbr in abbr_list]

    abbrdef_rex = re.compile(
        r'\s*@string\s*{\s*(' + valid_name_chars + r'*)\s*=(.*)', re.I)
    comment_rex = re.compile(r'@comment\s*{', re.I)
    preamble_rex = re.compile(r'@preamble\s*{', re.I)

    waiting_for_end_string = False
    kept_lines = []

    for line in filecont_source.splitlines():
        if line == ' ' or line == '':
            continue

        if waiting_for_end_string:
            if '}' in line:
                waiting_for_end_string = False
                continue

        if abbrdef_rex.search(line):
            abbr = abbrdef_rex.sub(r'\g<1>', line)
            # an empty name would match at every word boundary
            if abbr and abbr not in abbr_list:
                val = abbrdef_rex.sub(r'\g<2>', line)
                abbr_list.append(abbr)
                value_list.append(val.strip())
                abbr_rex.append(re.compile(front + abbr + back, re.I))
            waiting_for_end_string = True
            continue

        if comment_rex.search(line) or preamble_rex.search(line):
            waiting_for_end_string = True
            continue

        # replace subsequent abbreviations with the value
        for abbr, value, pattern in zip(abbr_list, value_list, abbr_rex):
            if pattern.search(line):
                if verify_out_of_braces(line, abbr) == 1:
                    # group 1 is the optional comma after the abbreviation
                    line = pattern.sub(
                        lambda match, value=value: value + match.group(1),
                        line)
                # Check for # concatenations
                if concatsplit_rex.search(line):
                    line = concat_line(line)

        kept_lines.append(line + '\n')

    filecont2 = ''.join(kept_lines)

    # Do one final pass over file

    # make sure that didn't end up with {" or }" after the substitution
    filecont2 = filecont2.replace('{"', '{{')
    filecont2 = filecont2.replace('"}', '}}')

    afterquotevalue_rex = re.compile(r'"\s*,\s*')
    afterbrace_rex = re.compile(r'"\s*}')
    afterbracevalue_rex = re.compile(r'(=\s*{[^=]*)},\s*')

    # add new lines to data that changed because of abbreviation substitutions
    filecont2 = afterquotevalue_rex.sub('",\n', filecont2)
    filecont2 = afterbrace_rex.sub('"\n}', filecont2)
    filecont2 = afterbracevalue_rex.sub('\\g<1>},\n', filecont2)

    return filecont2
651 | | |
652 | | # |
653 | | # convert @type( ... ) to @type{ ... } |
654 | | # |
def no_outer_parens(filecont):
    """Rewrite entries delimited as @type( ... ) into @type{ ... }.

    The text is cut at every parenthesis and brace.  A piece containing
    '@' marks a possible entry start; when the very next delimiter is '('
    it is replaced by '{', and the ')' that brings the parenthesis depth
    back to zero is replaced by '}'.  All other text is passed through.

    @param filecont  the BibTeX text as one string
    @return          the text with the outer parentheses converted
    """
    tokens = re.split('([(){}])', filecont)
    entry_start = re.compile(r'@\w*')

    paren_depth = 0
    entry_open = False       # inside an entry that was opened with '('
    expect_opener = False    # the previous piece announced an entry

    rebuilt = []
    for token in tokens:
        if expect_opener:
            expect_opener = False
            if token == '(':
                token = '{'
                paren_depth += 1
            else:
                # entry uses braces already (or is no entry at all)
                entry_open = False

        if token == '(':
            paren_depth += 1
        elif token == ')':
            paren_depth -= 1
            if entry_open and paren_depth == 0:
                token = '}'
                entry_open = False
        elif entry_start.search(token):
            entry_open = True
            expect_opener = True

        rebuilt.append(token)

    return ''.join(rebuilt)
695 | | |
696 | | |
697 | | # |
698 | | # make all whitespace into just one space |
699 | | # format the bibtex file into a usable form. |
700 | | # |
def bibtexwasher(filecont_source):
    """Normalise raw BibTeX lines into one header/field/brace per line.

    Steps:
      1. strip every line, collapse runs of whitespace, drop empty lines
         and lines starting with '%', and join the rest into one string,
      2. convert @type( ... ) entries to @type{ ... },
      3. insert line breaks after field values, after entry headers and
         before every '@',
      4. turn the LaTeX escape for an ampersand into a plain '&',
      5. put the brace that closes an entry on a line of its own,
      6. expand @string abbreviations,
      7. drop blank lines.

    Uses string methods instead of the ``string`` module functions, which
    no longer exist on Python 3.

    @param filecont_source  iterable of raw lines of the .bib file
    @return  list of washed lines, each ending with a newline
    """
    space_rex = re.compile(r'\s+')
    comment_rex = re.compile(r'\s*%')

    # remove trailing and excessive whitespace
    # ignore comments
    pieces = []
    for line in filecont_source:
        line = space_rex.sub(' ', line.strip())
        if line != '' and not comment_rex.match(line):
            pieces.append(' ' + line)

    # the file is in one long string
    filecont = ''.join(pieces)

    filecont = no_outer_parens(filecont)

    #
    # split lines according to preferred syntax scheme
    #
    filecont = re.sub(r'(=\s*{[^=]*)},', '\\g<1>},\n', filecont)

    # add new lines after commas that are after values
    filecont = re.sub(r'"\s*,', '",\n', filecont)
    filecont = re.sub(r'=\s*([\w\d]+)\s*,', '= \\g<1>,\n', filecont)
    filecont = re.sub(r'(@\w*)\s*({(\s*)[^,\s]*)\s*,',
                      '\n\n\\g<1>\\g<2>,\n', filecont)

    # add new lines after }
    filecont = re.sub(r'"\s*}', '"\n}\n', filecont)
    filecont = re.sub(r'}\s*,', '},\n', filecont)

    filecont = re.sub(r'@(\w*)', '\n@\\g<1>', filecont)

    # character encoding, reserved latex characters
    filecont = re.sub(r'{\\&}', '&', filecont)
    filecont = re.sub(r'\\&', '&', filecont)

    # do checking for open braces to get format correct:
    # a brace that closes the outermost level starts a new line
    open_brace_count = 0
    rebuilt = []
    for phrase in re.split('([{}])', filecont):
        if phrase == '{':
            open_brace_count += 1
        elif phrase == '}':
            open_brace_count -= 1
            if open_brace_count == 0:
                rebuilt.append('\n')
        rebuilt.append(phrase)

    filecont2 = bibtex_replace_abbreviations(''.join(rebuilt))

    # gather the non-blank lines
    return [line + '\n'
            for line in filecont2.splitlines()
            if line != '' and line != ' ']
783 | | |
784 | | |
def filehandler(filepath):
    """Convert the BibTeX file *filepath* and write the Doxygen page to stdout.

    The output is a Doxygen comment block that starts with
    "\\page references References" and contains the lines produced by
    bibtexdecoder().

    When the file cannot be read, "Could not open file: <path>" is written
    to standard error and nothing is written to standard output.  The
    earlier version caught every exception, printed the message into the
    (usually redirected) output and then failed with a NameError.

    @param filepath  path of the .bib file
    """
    import sys

    try:
        with open(filepath, 'r') as fd:
            filecont_source = fd.readlines()
    except (IOError, OSError):
        sys.stderr.write('Could not open file: ' + filepath + '\n')
        return

    washeddata = bibtexwasher(filecont_source)
    outdata = bibtexdecoder(washeddata)

    write = sys.stdout.write
    write('/**\n')
    write('\\page references References\n')
    write('\n')
    for line in outdata:
        write(line + '\n')
    write('*/\n')
801 | | |
802 | | # main program |
803 | | |
def main():
    """Command line entry point: convert the file named by the first argument.

    Without an argument, "No input file" is written to standard error and
    the script exits with status 1 (it used to print to standard output
    and exit with status 0, which looked like success to the caller).
    """
    import sys

    if len(sys.argv) < 2:
        sys.stderr.write('No input file\n')
        sys.exit(1)
    filehandler(sys.argv[1])


if __name__ == "__main__":
    main()
814 | | |
815 | | |
816 | | # end python script |