client = TeiUtils()
gdb-utils
TEI/XMLデータを扱う際のツール群です。
Install
pip install git+https://github.com/nakamura196/gdb-utils.git
from gdb_utils.core import *
使い方
ダウンロード
url = "https://utda.github.io/goethe/data/xml/goethe.xml"
path = "data/goethe.xml"
client.download(url, path)
分析
client.get_tag_freq(path)
頻度の降順で表示
client.df
| | Tag | Count |
---|---|---|
8 | persName | 15 |
55 | lb | 13 |
10 | p | 9 |
50 | app | 8 |
51 | lem | 8 |
... | ... | ... |
28 | width | 1 |
29 | handDesc | 1 |
1 | teiHeader | 1 |
31 | history | 1 |
30 | handNote | 1 |
61 rows × 2 columns
名前の昇順で表示
client.df_tag
| | Tag | Count |
---|---|---|
0 | TEI | 1 |
18 | addrLine | 1 |
17 | address | 1 |
50 | app | 8 |
5 | author | 2 |
... | ... | ... |
4 | title | 7 |
3 | titleStmt | 1 |
41 | variantEncoding | 1 |
28 | width | 1 |
13 | witness | 2 |
61 rows × 2 columns
RomaでタグをチェックするためのJSスクリプトを生成
print(client.get_javascript())
/**
 * Ticks the checkbox belonging to every list entry whose label equals one of
 * the given names, and reports the names that had no matching label.
 *
 * A label is any element matching `.mdc-list-item__primary-text`; its trimmed
 * text content is compared for exact equality. The checkbox is the
 * `.mdc-checkbox__native-control` element inside the label's closest
 * `.mdc-list-item` ancestor.
 *
 * NOTE(review): only the `checked` property is set; no `change`/`input` event
 * is dispatched. Confirm that the host page picks the new state up.
 *
 * @param {string[]} textValues - Names to look for.
 * @returns {string[]} The names for which no label was found, in input order
 *   (a name listed twice and missing is reported twice).
 */
function checkCheckboxesWithTextValues(textValues) {
  // Index all labels by their trimmed text once, rather than re-querying the
  // whole document for every requested name.
  const itemsByLabel = new Map();
  document.querySelectorAll('.mdc-list-item__primary-text').forEach(function (item) {
    const label = item.textContent.trim();
    if (!itemsByLabel.has(label)) {
      itemsByLabel.set(label, []);
    }
    itemsByLabel.get(label).push(item);
  });

  // Names that had no matching label.
  const notFound = [];

  textValues.forEach(function (textToMatch) {
    const matches = itemsByLabel.get(textToMatch);
    if (!matches) {
      notFound.push(textToMatch);
      return;
    }
    matches.forEach(function (item) {
      // `closest` returns null when the label is not inside a list item;
      // skip such labels instead of throwing and aborting the whole run.
      const listItem = item.closest('.mdc-list-item');
      const checkbox = listItem
        ? listItem.querySelector('.mdc-checkbox__native-control')
        : null;
      if (checkbox) {
        checkbox.checked = true;
      }
    });
  });

  return notFound;
}
// Element names whose checkboxes should be ticked.
const itemsToCheck = ["TEI", "addrLine", "address", "app", "author", "back", "bibl", "body", "closer", "correspAction", "correspDesc", "country", "date", "dimensions", "district", "div", "editor", "editorialDecl", "encodingDesc", "extent", "fileDesc", "handDesc", "handNote", "height", "history", "idno", "institution", "lb", "lem", "listPerson", "listWit", "location", "msDesc", "msIdentifier", "objectDesc", "opener", "origin", "p", "persName", "person", "physDesc", "placeName", "profileDesc", "provenance", "publicationStmt", "publisher", "rdg", "resp", "respStmt", "salute", "signed", "sourceDesc", "space", "supportDesc", "teiHeader", "text", "title", "titleStmt", "variantEncoding", "width", "witness"];
// Tick every listed element and collect the names that had no matching entry.
const itemsNotFound = checkCheckboxesWithTextValues(itemsToCheck);
// Report the outcome on the console.
if (itemsNotFound.length === 0) {
  console.log('All items were found and checked.');
} else {
  console.log('These items were not found:', itemsNotFound);
}
Splits an XML file into multiple files based on the provided element and attributes.
path = "data/sample.xml"
output = "data/sample/out.xml"
client.split_xml_file(path, output, "pb")
[]
[autoreload of gdb_utils.core failed: Traceback (most recent call last):
File "/Users/nakamura/git/gdb/gdb-utils/.venv/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 276, in check
superreload(m, reload, self.old_objects)
File "/Users/nakamura/git/gdb/gdb-utils/.venv/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 500, in superreload
update_generic(old_obj, new_obj)
File "/Users/nakamura/git/gdb/gdb-utils/.venv/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 397, in update_generic
update(a, b)
File "/Users/nakamura/git/gdb/gdb-utils/.venv/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 349, in update_class
if update_generic(old_obj, new_obj):
File "/Users/nakamura/git/gdb/gdb-utils/.venv/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 397, in update_generic
update(a, b)
File "/Users/nakamura/git/gdb/gdb-utils/.venv/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 309, in update_function
setattr(old, name, getattr(new, name))
ValueError: startElement() requires a code object with 0 free vars, not 1
]
path = "data/sample.xml"
output = "data/sample2/out.xml"
client.split_xml_file(path, output, "div", split_attr="type", split_value="sample")
[]