client = TeiUtils()

# gdb-utils
TEI/XMLデータを扱う際のツール群です。
Install
pip install git+https://github.com/nakamura196/gdb-utils.git
from gdb_utils.core import *

使い方
ダウンロード
url = "https://utda.github.io/goethe/data/xml/goethe.xml"
path = "data/goethe.xml"
client.download(url, path)

分析
client.get_tag_freq(path)

頻度の降順で表示
client.df

| | Tag | Count |
|---|---|---|
| 8 | persName | 15 |
| 55 | lb | 13 |
| 10 | p | 9 |
| 50 | app | 8 |
| 51 | lem | 8 |
| ... | ... | ... |
| 28 | width | 1 |
| 29 | handDesc | 1 |
| 1 | teiHeader | 1 |
| 31 | history | 1 |
| 30 | handNote | 1 |
61 rows × 2 columns
名前の昇順で表示
client.df_tag

| | Tag | Count |
|---|---|---|
| 0 | TEI | 1 |
| 18 | addrLine | 1 |
| 17 | address | 1 |
| 50 | app | 8 |
| 5 | author | 2 |
| ... | ... | ... |
| 4 | title | 7 |
| 3 | titleStmt | 1 |
| 41 | variantEncoding | 1 |
| 28 | width | 1 |
| 13 | witness | 2 |
61 rows × 2 columns
RomaでタグをチェックするためのJSスクリプトを生成
print(client.get_javascript())
/**
 * Ticks the checkbox of every list row whose primary-text label matches one
 * of the given strings, and reports which strings matched no label at all.
 *
 * A string counts as "found" as soon as a `.mdc-list-item__primary-text`
 * element with that (trimmed) text exists, even when no checkbox could be
 * located for that label.
 *
 * @param {string[]} textValues - Label texts to look for (exact match after trimming the label).
 * @returns {string[]} The entries of `textValues` for which no label was found, in input order.
 */
function checkCheckboxesWithTextValues(textValues) {
  // Scan the DOM once and index the labels by their trimmed text, instead of
  // re-running querySelectorAll for every requested value.
  const labelsByText = new Map();
  document.querySelectorAll('.mdc-list-item__primary-text').forEach(function (label) {
    const text = label.textContent.trim();
    const group = labelsByText.get(text);
    if (group) {
      group.push(label);
    } else {
      labelsByText.set(text, [label]);
    }
  });

  // Names that matched no label.
  const notFound = [];

  textValues.forEach(function (textToMatch) {
    const labels = labelsByText.get(textToMatch);
    if (!labels) {
      notFound.push(textToMatch);
      return;
    }

    labels.forEach(function (label) {
      // closest() returns null when the label is not inside a list row;
      // skip such labels rather than throwing and aborting the whole run.
      const row = label.closest('.mdc-list-item');
      if (!row) {
        return;
      }
      const checkbox = row.querySelector('.mdc-checkbox__native-control');
      if (checkbox) {
        checkbox.checked = true;
      }
    });
  });

  return notFound;
}
// Element names whose checkboxes should be ticked.
const itemsToCheck = [
  "TEI", "addrLine", "address", "app", "author", "back", "bibl", "body",
  "closer", "correspAction", "correspDesc", "country", "date", "dimensions",
  "district", "div", "editor", "editorialDecl", "encodingDesc", "extent",
  "fileDesc", "handDesc", "handNote", "height", "history", "idno",
  "institution", "lb", "lem", "listPerson", "listWit", "location", "msDesc",
  "msIdentifier", "objectDesc", "opener", "origin", "p", "persName", "person",
  "physDesc", "placeName", "profileDesc", "provenance", "publicationStmt",
  "publisher", "rdg", "resp", "respStmt", "salute", "signed", "sourceDesc",
  "space", "supportDesc", "teiHeader", "text", "title", "titleStmt",
  "variantEncoding", "width", "witness",
];

// Tick the matching checkboxes and collect the names that had no match.
const itemsNotFound = checkCheckboxesWithTextValues(itemsToCheck);

// Report the outcome on the console.
if (itemsNotFound.length === 0) {
  console.log('All items were found and checked.');
} else {
  console.log('These items were not found:', itemsNotFound);
}
Splits an XML file into multiple files based on the provided element and attributes.
path = "data/sample.xml"
output = "data/sample/out.xml"
client.split_xml_file(path, output, "pb")

[]
[autoreload of gdb_utils.core failed: Traceback (most recent call last):
File "/Users/nakamura/git/gdb/gdb-utils/.venv/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 276, in check
superreload(m, reload, self.old_objects)
File "/Users/nakamura/git/gdb/gdb-utils/.venv/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 500, in superreload
update_generic(old_obj, new_obj)
File "/Users/nakamura/git/gdb/gdb-utils/.venv/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 397, in update_generic
update(a, b)
File "/Users/nakamura/git/gdb/gdb-utils/.venv/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 349, in update_class
if update_generic(old_obj, new_obj):
File "/Users/nakamura/git/gdb/gdb-utils/.venv/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 397, in update_generic
update(a, b)
File "/Users/nakamura/git/gdb/gdb-utils/.venv/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 309, in update_function
setattr(old, name, getattr(new, name))
ValueError: startElement() requires a code object with 0 free vars, not 1
]
path = "data/sample.xml"
output = "data/sample2/out.xml"
client.split_xml_file(path, output, "div", split_attr="type", split_value="sample")

[]