• 카테고리

    질문 & 답변
  • 세부 분야

    프로그래밍 언어

  • 해결 여부

    미해결

XML Read -> JSON write 숙제 제출

18.11.26 23:04 작성 조회수 86

0

낙서장 같은 코드..
이번 강의도 감사합니다!
질문. BeautifulSoup 모듈을 사용할 때는 항상 아래처럼 bs4와 함께 import해야 하나요?
from bs4 import BeautifulSoup

import os
import re
import json
from bs4 import BeautifulSoup

# SEPARATE XML=================================================================
# Split one bulk file, which holds many XML documents concatenated back to
# back, into one file per patent application under `target_root`.

# setting
raw_xml_file = "ipa110106.XML"
target_root = "data"

# One whole document: from its XML declaration to the first closing root tag.
# `[\s\S]+?` is used (instead of `.`) so the lazy match can span newlines.
document_pattern = re.compile(r'<\?xml[\s\S]+?</us-patent-application>')

# Value of the `file` attribute on the root tag. `[^"]+` stops at the closing
# quote, so text after the attribute can never leak into the file name (the
# previous greedy `.+XML` ran up to the last "XML" on the line).
file_attr_pattern = re.compile(r'<us-patent-application\b[^>]*?\sfile="([^"]+)"')

# read raw xml
# The encoding is explicit so the result does not depend on the platform's
# locale default.
# NOTE(review): assumes the bulk file is UTF-8 (the XML default when no other
# encoding is declared) - confirm against the file's XML declaration.
with open(raw_xml_file, "r", encoding="utf-8") as raw_xml:
    xml_contents = raw_xml.read()

# separate xml contents
# The pattern has no capture groups, so findall() returns each complete
# document as a single string.
chopped_txts = document_pattern.findall(xml_contents)

# check dir
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs(target_root, exist_ok=True)

# make new files with separated xml contents
for txt in chopped_txts:
    name_match = file_attr_pattern.search(txt)
    if name_match is None:
        # Fail with a clear message instead of an IndexError on an empty list.
        raise ValueError(
            'no file="..." attribute found on the <us-patent-application> tag'
        )

    # basename() drops any directory part, so a document can never be written
    # outside target_root.
    document_file_name = os.path.basename(name_match.group(1))
    target_path = os.path.join(target_root, document_file_name)
    with open(target_path, "w", encoding="utf-8") as new_xml:
        new_xml.write(txt)

# MAKE DICTIONARY==============================================================
# Parse every XML file under `source_root` and collect its bibliographic
# fields into `data_dict`, keyed by the publication document number.

# setting
data_dict = dict()
source_root = "data"
# Only *.xml entries are parsed, so stray files in the directory (editor
# backups, OS metadata) do not crash the run. os.listdir() returns entries in
# arbitrary order; sorting makes the order of keys in data_dict reproducible.
source_xml_files = sorted(
    name for name in os.listdir(source_root) if name.lower().endswith(".xml")
)

# make Dictionary Data from Xml
for xml_file in source_xml_files:

    # The handle gets its own name so the loop variable keeps holding the
    # file name. The encoding is explicit so parsing does not depend on the
    # platform's locale default.
    # NOTE(review): assumes the files are UTF-8 - confirm.
    with open(os.path.join(source_root, xml_file), "r", encoding="utf-8") as xml_handle:
        # BeautifulSoup consumes the whole file here, so the handle can be
        # closed before the tags are queried.
        soup = BeautifulSoup(xml_handle, "lxml")

    # xml tags
    # NOTE: find() returns None for an absent tag, so a document missing any
    # of the tags used below raises AttributeError.
    publication_reference_tag = soup.find("publication-reference")
    application_reference_tag = soup.find("application-reference")
    p_document_id_tag = publication_reference_tag.find("document-id")
    a_document_id_tag = application_reference_tag.find("document-id")

    # extract info from Xml
    p_country = p_document_id_tag.find("country").get_text()  # publication country
    p_doc_number = p_document_id_tag.find("doc-number").get_text()  # publication number
    p_kind = p_document_id_tag.find("kind").get_text()  # publication kind code
    p_date = p_document_id_tag.find("date").get_text()  # publication date
    a_country = a_document_id_tag.find("country").get_text()  # application country
    a_doc_number = a_document_id_tag.find("doc-number").get_text()  # application number
    a_date = a_document_id_tag.find("date").get_text()  # application (filing) date
    invention_title = soup.find("invention-title").get_text()  # invention title

    # store info in Patent Dict (a fresh dict for every document)
    patent_dict = {
        "publication-country": p_country,
        "publication-number": p_doc_number,
        "publication-kind": p_kind,
        "publication-date": p_date,
        "application-country": a_country,
        "application-number": a_doc_number,
        "application-date": a_date,
        "invention-title": invention_title,
    }

    # add patent info in data dict
    # key value is publication-doc-number
    data_dict[p_doc_number] = patent_dict

# EXPORT TO JSON===============================================================
# Destination directory and file name for the collected patent records.
output_root = "output"
output_file = "my_first_json.json"
output_path = os.path.join(output_root, output_file)

# Create the destination directory the first time the script runs.
output_dir_exists = os.path.isdir(output_root)
if not output_dir_exists:
    os.mkdir(output_root)

# Mode "w" replaces any export left over from an earlier run.
with open(output_path, "w") as json_file:
    json_file.write(json.dumps(data_dict))

답변 0

답변을 작성해보세요.

답변을 기다리고 있는 질문이에요.
첫번째 답변을 남겨보세요!