-
카테고리
-
세부 분야
프로그래밍 언어
-
해결 여부
미해결
XML Read -> JSON write 숙제 제출
18.11.26 23:04 작성 조회수 86
0
낙서장 같은 코드..
이번 강의도 감사합니다!
질문. BeautifulSoup 모듈을 사용할 때는 항상 아래처럼 bs4와 함께 import해야 하나요?
from bs4 import BeautifulSoup
import os
import re
import json
from bs4 import BeautifulSoup
# SEPARATE XML=================================================================
# Split the concatenated bulk file into one XML file per patent application.
# Each output file is named after the file="...XML" attribute of its
# <us-patent-application> root tag.

# setting
raw_xml_file = "ipa110106.XML"
target_root = "data"

# read raw xml
# The encoding is explicit because open() otherwise uses the platform default,
# which differs between machines.
# NOTE(review): assumes the bulk file is UTF-8 - confirm against its XML
# declaration.
with open(raw_xml_file, "r", encoding="utf-8") as raw_xml:
    xml_contents = raw_xml.read()

# separate xml contents
# One document runs from an "<?xml" declaration to the nearest closing
# </us-patent-application>; the non-greedy quantifier keeps documents apart.
chopped_txts = [
    document.group(0)
    for document in re.finditer(
        r'<\?xml[\s\S]+?</us-patent-application>', xml_contents
    )
]

# check dir
os.makedirs(target_root, exist_ok=True)

# make new files with separated xml contents
# [^>] and [^"] stop the match at the end of the tag / attribute value, so a
# later "XML" on the same line cannot leak into the file name.
file_attr_pattern = re.compile(
    r'<us-patent-application\b[^>]*?\bfile="([^"]+XML)"'
)
for txt in chopped_txts:
    name_match = file_attr_pattern.search(txt)
    if name_match is None:
        # Without the attribute there is no name to write to; report the
        # chunk instead of failing with an opaque IndexError.
        print('skipped a chunk without a file="...XML" attribute')
        continue
    # basename() keeps the output inside target_root even if the attribute
    # value contains directory components.
    new_file_name = os.path.basename(name_match.group(1))
    new_file_path = os.path.join(target_root, new_file_name)
    with open(new_file_path, "w", encoding="utf-8") as new_xml:
        new_xml.write(txt)
# MAKE DICTIONARY==============================================================
# Parse every per-patent XML file under source_root and collect its
# bibliographic fields in data_dict, keyed by the publication doc-number.

# setting
data_dict = dict()
source_root = "data"
# Sorted so records are inserted in a reproducible order; os.listdir()
# returns names in arbitrary order.
source_xml_files = sorted(os.listdir(source_root))


def _document_id(soup, reference_name):
    """Return the <document-id> tag inside ``reference_name``, or None."""
    reference_tag = soup.find(reference_name)
    if reference_tag is None:
        return None
    return reference_tag.find("document-id")


def _tag_text(parent, tag_name):
    """Return the text of the first ``tag_name`` under ``parent``, or None."""
    if parent is None:
        return None
    tag = parent.find(tag_name)
    if tag is None:
        return None
    return tag.get_text()


# make Dictionary Data from Xml
for xml_name in source_xml_files:
    xml_path = os.path.join(source_root, xml_name)
    # Ignore sub-directories and stray non-XML files in the source directory.
    if not os.path.isfile(xml_path) or not xml_name.lower().endswith(".xml"):
        continue

    # The handle gets its own name so it no longer shadows the loop variable.
    # NOTE(review): assumes the files are UTF-8 - confirm.
    with open(xml_path, "r", encoding="utf-8") as xml_handle:
        soup = BeautifulSoup(xml_handle, "lxml")

    # xml tags
    p_document_id_tag = _document_id(soup, "publication-reference")
    a_document_id_tag = _document_id(soup, "application-reference")

    # The publication doc-number is the dictionary key, so a document
    # without one cannot be stored.
    p_doc_number = _tag_text(p_document_id_tag, "doc-number")
    if p_doc_number is None:
        print("skipped", xml_name, "- no publication doc-number found")
        continue

    # extract info from Xml and store it in the per-patent dict; a tag that
    # is absent yields None instead of raising AttributeError.
    patent_dict = {
        # publication country
        "publication-country": _tag_text(p_document_id_tag, "country"),
        # publication number
        "publication-number": p_doc_number,
        # publication kind code
        "publication-kind": _tag_text(p_document_id_tag, "kind"),
        # publication date
        "publication-date": _tag_text(p_document_id_tag, "date"),
        # application country
        "application-country": _tag_text(a_document_id_tag, "country"),
        # application number
        "application-number": _tag_text(a_document_id_tag, "doc-number"),
        # application date
        "application-date": _tag_text(a_document_id_tag, "date"),
        # invention title
        "invention-title": _tag_text(soup, "invention-title"),
    }

    # add patent info in data dict
    # key value is publication-doc-number
    data_dict[p_doc_number] = patent_dict
# EXPORT TO JSON===============================================================
# Serialize the collected patent records (data_dict) to a single JSON file.

# setting
output_root = "output"
output_file = "my_first_json.json"
output_path = os.path.join(output_root, output_file)

# make sure the output directory exists before opening the file
output_dir_exists = os.path.isdir(output_root)
if not output_dir_exists:
    os.mkdir(output_root)

# mode "w" truncates, so an existing file is overwritten
with open(output_path, "w") as json_handle:
    json.dump(data_dict, json_handle)
답변을 작성해보세요.
답변 0