4

Using R Studio 1.0.44 on a MacOS 10.12.1

This is my first time working with xml files and I'm having quite a bit of trouble.. I want to turn my xml file into an r data frame that looks something like this:

Date    househouldid    memberid    session begin   end
11/1/15 4FDYT14A1Q             1      235   49448   49453
11/1/15 4FDYT14A1Q             1    1010001 49453   49463
11/1/15 4FDYT14A1Q             1    1010475 49463   49468

I only want Date, householdid, memberid, session, begin and end in the data frame.

I tried...

xmldataframe<-xmlToDataFrame("DQA_audxml_20150105_EN.xml")
print(xmldataframe)

and got this...

destinations members regular_guests occasional_guests tvsets household_process

Here's part of the xml file (I'm restricted by the character count)..

<?xml version="1.0" encoding="UTF-8"?>
<households date="2015-01-05" creation_date="2015-01-11T04:22:35" xmlns:s="http://www.mediametrie.fr/nge/schemas" xmlns:xalan="http://xml.apache.org/xalan">
  <household id="95D7N14AFA">
    <destinations />
    <members>
      <member id="1">
        <member_process result="OK" vacation="undefined">
          <individual_audience />
          <alarms />
        </member_process>
      </member>
      <member id="2">
        <member_process result="OK" vacation="undefined">
          <individual_audience />
          <alarms />
        </member_process>
      </member>
      <member id="3">
        <member_process result="OK" vacation="undefined">
          <individual_audience />
          <alarms />
        </member_process>
      </member>
    </members>
    <regular_guests />
    <occasional_guests />
    <tvsets>
      <tvset id="1">
        <tvset_process result="OK">
          <tvset_audience />
          <alarms>
            <alarm id="AL_T_P_VALID_LAST_HOUR_REBOOT" rule_id="T_P_METER_STOPPING_TIMESTAMPING">
              <parameters>
                <parameter name="unique_id" value="149" />
                <parameter name="reboot_date" value="2015-01-05T03:34:23" />
                <parameter name="length" value="0.35116898148148146" />
              </parameters>
            </alarm>
            <alarm id="AL_T_P_VALID_LAST_HOUR_REBOOT" rule_id="T_P_METER_STOPPING_TIMESTAMPING">
              <parameters>
                <parameter name="unique_id" value="150" />
                <parameter name="reboot_date" value="2015-01-05T05:28:34" />
                <parameter name="length" value="0.02806712962962963" />
              </parameters>
            </alarm>
            <alarm id="AL_T_P_VALID_LAST_HOUR_REBOOT" rule_id="T_P_METER_STOPPING_TIMESTAMPING">
              <parameters>
                <parameter name="unique_id" value="151" />
                <parameter name="reboot_date" value="2015-01-05T08:22:46" />
                <parameter name="length" value="0.07322916666666666" />
              </parameters>
            </alarm>
            <alarm id="AL_T_P_VALID_LAST_HOUR_REBOOT" rule_id="T_P_METER_STOPPING_TIMESTAMPING">
              <parameters>
                <parameter name="unique_id" value="152" />
                <parameter name="reboot_date" value="2015-01-05T14:32:43" />
                <parameter name="length" value="0.004803240740740742" />
              </parameters>
            </alarm>
            <alarm id="AL_T_P_VALID_LAST_HOUR_REBOOT" rule_id="T_P_METER_STOPPING_TIMESTAMPING">
              <parameters>
                <parameter name="unique_id" value="153" />
                <parameter name="reboot_date" value="2015-01-05T18:30:47" />
                <parameter name="length" value="0.0012384259259259258" />
              </parameters>
            </alarm>
            <alarm id="AL_T_P_VALID_LAST_HOUR_REBOOT" rule_id="T_P_METER_STOPPING_TIMESTAMPING">
              <parameters>
                <parameter name="unique_id" value="154" />
                <parameter name="reboot_date" value="2015-01-06T02:52:01" />
                <parameter name="length" value="0.33817129629629633" />
              </parameters>
            </alarm>
          </alarms>
        </tvset_process>
      </tvset>
    </tvsets>
    <household_process result="OK" vacation="no">
      <alarms />
    </household_process>
  </household>
  <household id="ISDPN14ASG">
    <destinations />
    <members>
      <member id="1">
        <member_process result="OK" vacation="undefined">
          <individual_audience />
          <alarms />
        </member_process>
      </member>
      <member id="2">
        <member_process result="OK" vacation="undefined">
          <individual_audience />
          <alarms />
        </member_process>
      </member>
      <member id="3">
        <member_process result="OK" vacation="undefined">
          <individual_audience />
          <alarms />
        </member_process>
      </member>
      <member id="4">
        <member_process result="OK" vacation="undefined">
          <individual_audience />
          <alarms />
        </member_process>
      </member>
      <member id="5">
        <member_process result="OK" vacation="undefined">
          <individual_audience />
          <alarms />
        </member_process>
      </member>
    </members>
    <regular_guests />
    <occasional_guests />
    <tvsets>
      <tvset id="1">
        <tvset_process result="KO">
          <tvset_audience />
          <alarms>
            <alarm id="AL_R_P_EVENT_CHRONOLOGY" rule_id="R_P_EVENT_CHRONOLOGY">
              <parameters>
                <parameter name="unique_id" value="16195" />
              </parameters>
            </alarm>
            <alarm id="AL_PROCESS_STOP" rule_id="R_P_EVENT_CHRONOLOGY">
              <parameters />
            </alarm>
          </alarms>
        </tvset_process>
      </tvset>
    </tvsets>
    <household_process result="KO" vacation="no">
      <alarms>
        <alarm id="AL_T_FP_HOUSEHOLD_ABS_DECLARATION" rule_id="T_FP_HOUSEHOLD_ABS_DECLARATION">
          <parameters />
        </alarm>
        <alarm id="AL_R_FP_NO_TVSET_KO" rule_id="R_FP_NO_TVSET_KO">
          <parameters>
            <parameter name="tvset_id" value="1" />
          </parameters>
        </alarm>
        <alarm id="AL_R_FP_AT_LEAST_ONE_TVSET_OK" rule_id="R_FP_AT_LEAST_ONE_TVSET_OK">
          <parameters />
        </alarm>
      </alarms>
    </household_process>
  </household>
  <household id="4FDYT14A1Q">
    <destinations />
    <members>
      <member id="1">
        <member_process result="OK" vacation="undefined">
          <individual_audience />
          <alarms />
        </member_process>
      </member>
      <member id="2">
        <member_process result="OK" vacation="undefined">
          <individual_audience>
            <individual_audience_tvset id="1">
              <channel session="1010497" begin="46722" end="46738" />
              <channel session="48" begin="46738" end="46743" />
              <channel session="235" begin="46743" end="46748" />
              <channel session="47" begin="46748" end="46753" />
              <channel session="1010383" begin="46753" end="46973" />
              <channel session="1010497" begin="46973" end="46983" />
              <channel session="1010319" begin="46983" end="46993" />
              <channel session="1010072" begin="46993" end="47578" />
              <channel session="1010499" begin="47578" end="47618" />
              <channel session="47" begin="47618" end="47633" />
              <channel session="1010383" begin="47633" end="47693" />
              <channel session="48" begin="47693" end="47703" />
              <channel session="235" begin="47703" end="47728" />
              <channel session="1010469" begin="47728" end="47778" />
              <channel session="235" begin="47778" end="47783" />
              <channel session="48" begin="47783" end="47828" />
              <channel session="1010367" begin="47828" end="47853" />
              <channel session="48" begin="47853" end="48373" />
              <channel session="1010383" begin="48373" end="48573" />
              <channel session="48" begin="48573" end="49433" />
              <channel session="1010452" begin="49433" end="49448" />
              <channel session="235" begin="49448" end="49453" />
              <channel session="1010001" begin="49453" end="49463" />
              <channel session="1010475" begin="49463" end="49468" />
              <channel session="1010383" begin="49468" end="49483" />
              <channel session="48" begin="49483" end="49508" />
              <channel session="1010403" begin="49508" end="49543" />
              <channel session="48" begin="49543" end="53839" />
              <channel session="1010354" begin="60868" end="60878" />
              <channel session="235" begin="60878" end="60893" />
              <channel session="48" begin="60893" end="60908" />
              <channel session="235" begin="60908" end="60913" />
              <channel session="1010072" begin="60913" end="60928" />
              <channel session="48" begin="60928" end="60933" />
              <channel session="235" begin="60933" end="60938" />
              <channel session="48" begin="60938" end="60943" />
              <channel session="235" begin="60943" end="60953" />
              <channel session="48" begin="60953" end="60968" />
              <channel session="235" begin="60968" end="60993" />
              <channel session="1010072" begin="60993" end="61483" />
              <channel session="1010499" begin="61483" end="61538" />
              <channel session="48" begin="61538" end="61543" />
              <channel session="47" begin="61543" end="61578" />
              <channel session="1010383" begin="61578" end="61603" />
              <channel session="1010072" begin="61603" end="63428" />
              <channel session="1010497" begin="63428" end="63438" />
              <channel session="1010319" begin="63438" end="63888" />
              <channel session="1010072" begin="63888" end="63898" />
              <channel session="235" begin="63898" end="63903" />
              <channel session="47" begin="63903" end="64368" />
              <channel session="48" begin="64368" end="64373" />
              <channel session="47" begin="64373" end="64378" />
              <channel session="1010072" begin="64378" end="64393" />
              <channel session="1010383" begin="64393" end="64408" />
              <channel session="1010001" begin="64408" end="64423" />
              <channel session="235" begin="64423" end="64428" />
              <channel session="1010469" begin="64428" end="64468" />
              <channel session="1010452" begin="64468" end="64473" />
              <channel session="48" begin="64473" end="64528" />
              <channel session="1010367" begin="64528" end="65053" />
              <channel session="48" begin="65053" end="65328" />
              <channel session="1010367" begin="65328" end="66613" />
              <channel session="1010383" begin="66613" end="66628" />
              <channel session="1010497" begin="66628" end="66823" />
              <channel session="47" begin="66823" end="66833" />
              <channel session="1010367" begin="66833" end="67538" />
              <channel session="1010383" begin="67538" end="67553" />
              <channel session="1010354" begin="67553" end="67563" />
              <channel session="1010497" begin="67563" end="67658" />
              <channel session="1010499" begin="67658" end="67713" />
              <channel session="1010072" begin="67713" end="67733" />
              <channel session="1010367" begin="67733" end="67803" />
              <channel session="48" begin="67803" end="68018" />
              <channel session="1010367" begin="68018" end="70963" />
              <channel session="1010383" begin="70963" end="71048" />
              <channel session="1010367" begin="71048" end="71848" />
              <channel session="1010383" begin="71848" end="72023" />
              <channel session="1010497" begin="72023" end="72043" />
              <channel session="1010072" begin="72043" end="72068" />
              <channel session="1010499" begin="72068" end="72083" />
              <channel session="47" begin="72083" end="72098" />
              <channel session="1010499" begin="72098" end="73643" />
              <channel session="1010072" begin="73643" end="73668" />
              <channel session="1010354" begin="73668" end="73683" />
              <channel session="1010383" begin="73683" end="73693" />
              <channel session="1010499" begin="73693" end="73708" />
              <channel session="1010072" begin="73708" end="73748" />
              <channel session="235" begin="73748" end="73753" />
              <channel session="1010497" begin="73753" end="73763" />
              <channel session="1010499" begin="73763" end="74323" />
              <channel session="1010497" begin="74323" end="74363" />
              <channel session="1010354" begin="74363" end="74378" />
              <channel session="1010499" begin="74378" end="74443" />
              <channel session="1010072" begin="74443" end="74453" />
              <channel session="1010499" begin="74453" end="74883" />
              <channel session="1010497" begin="74883" end="74938" />
              <channel session="1010072" begin="74938" end="74953" />
              <channel session="47" begin="74953" end="75603" />
              <channel session="1010497" begin="75603" end="77673" />
              <channel session="1010383" begin="77673" end="79208" />
              <channel session="1010354" begin="79208" end="79228" />
              <channel session="235" begin="79228" end="79233" />
              <channel session="1010497" begin="79233" end="79603" />
              <channel session="1010072" begin="79603" end="80088" />
              <channel session="1010497" begin="80088" end="80238" />
              <channel session="1010499" begin="80238" end="80253" />
              <channel session="235" begin="80253" end="80258" />
              <channel session="47" begin="80258" end="80423" />
              <channel session="1010072" begin="80423" end="80678" />
              <channel session="1010497" begin="80678" end="80688" />
              <channel session="47" begin="80688" end="80853" />
              <channel session="235" begin="80853" end="80858" />
              <channel session="1010354" begin="80858" end="80868" />
              <channel session="1010497" begin="80868" end="81428" />
              <channel session="1010072" begin="81428" end="81438" />
              <channel session="235" begin="81438" end="81448" />
              <channel session="47" begin="81448" end="81453" />
              <channel session="1010497" begin="81453" end="82108" />
              <channel session="1010354" begin="82108" end="82528" />
              <channel session="1010497" begin="82528" end="82538" />
              <channel session="1010072" begin="82538" end="82548" />
              <channel session="1010499" begin="82548" end="82608" />
              <channel session="47" begin="82608" end="82628" />
              <channel session="1010072" begin="82628" end="82648" />
              <channel session="1010354" begin="82648" end="82973" />
              <channel session="1010072" begin="82973" end="82983" />
              <channel session="1010499" begin="82983" end="83273" />
              <channel session="47" begin="83273" end="83313" />
              <channel session="1010383" begin="83313" end="83393" />
              <channel session="1010319" begin="83393" end="83443" />
              <channel session="1010499" begin="83443" end="83498" />
              <channel session="48" begin="83498" end="83688" />
              <channel session="47" begin="83688" end="83883" />
              <channel session="48" begin="83883" end="83898" />
              <channel session="1010258" begin="83898" end="84193" />
              <channel session="47" begin="84193" end="84198" />
              <channel session="1010260" begin="84198" end="84253" />
              <channel session="235" begin="84253" end="84258" />
              <channel session="47" begin="84258" end="84273" />
              <channel session="48" begin="84273" end="84423" />
              <channel session="235" begin="84423" end="84433" />
              <channel session="48" begin="84433" end="84448" />
              <channel session="1010260" begin="84448" end="84463" />
              <channel session="1010144" begin="84463" end="84578" />
              <channel session="235" begin="84578" end="84583" />
              <channel session="1010260" begin="84583" end="84628" />
              <channel session="235" begin="84628" end="84633" />
              <channel session="1010258" begin="84633" end="84643" />
              <channel session="48" begin="84643" end="85088" />
              <channel session="235" begin="85088" end="85098" />
              <channel session="1010258" begin="85098" end="85113" />
              <channel session="235" begin="85113" end="85118" />
              <channel session="1010144" begin="85118" end="85163" />
              <channel session="1010260" begin="85163" end="85178" />
              <channel session="48" begin="85178" end="85398" />
              <channel session="1010493" begin="85398" end="85418" />
              <channel session="48" begin="85418" end="85428" />
              <channel session="235" begin="85428" end="85433" />
              <channel session="47" begin="85433" end="85488" />
              <channel session="1010260" begin="85488" end="85493" />
              <channel session="1010258" begin="85493" end="85638" />
              <channel session="1010474" begin="85638" end="85785" />
            </individual_audience_tvset>
          </individual_audience>
          <alarms />
        </member_process>
      </member>
      <member id="3">
        <member_process result="OK" vacation="undefined">
          <individual_audience>
            <individual_audience_tvset id="1">
              <channel session="1010493" begin="54476" end="54490" />
              <channel session="47" begin="54490" end="55035" />
              <channel session="1010379" begin="55035" end="55210" />
              <channel session="48" begin="55210" end="55220" />
              <channel session="1010397" begin="55220" end="55570" />
              <channel session="235" begin="55570" end="55580" />
              <channel session="1010425" begin="55580" end="55585" />
              <channel session="1010397" begin="55585" end="55620" />
              <channel session="235" begin="55620" end="55625" />
              <channel session="48" begin="55625" end="56875" />
              <channel session="1010379" begin="56875" end="57345" />
              <channel session="48" begin="57345" end="57615" />
              <channel session="1010379" begin="57615" end="57865" />
              <channel session="48" begin="57865" end="58079" />
            </individual_audience_tvset>
          </individual_audience>
          <alarms />
        </member_process>
      </member>
      <member id="4">
        <member_process result="OK" vacation="undefined">
          <individual_audience />
          <alarms />
        </member_process>
      </member>
    </members>
    <regular_guests />
    <occasional_guests />
    <tvsets>
      <tvset id="1">
        <tvset_process result="OK">
          <tvset_audience>
            <channel session="1010497" begin="46708" end="46738" />
            <channel session="48" begin="46738" end="46743" />
            <channel session="235" begin="46743" end="46748" />
            <channel session="47" begin="46748" end="46753" />
            <channel session="1010383" begin="46753" end="46973" />
            <channel session="1010497" begin="46973" end="46983" />
            <channel session="1010319" begin="46983" end="46993" />
            <channel session="1010072" begin="46993" end="47578" />
            <channel session="1010499" begin="47578" end="47618" />
            <channel session="47" begin="47618" end="47633" />
            <channel session="1010383" begin="47633" end="47693" />
            <channel session="48" begin="47693" end="47703" />
            <channel session="235" begin="47703" end="47728" />
            <channel session="1010469" begin="47728" end="47778" />
            <channel session="235" begin="47778" end="47783" />
            <channel session="48" begin="47783" end="47828" />
            <channel session="1010367" begin="47828" end="47853" />
            <channel session="48" begin="47853" end="48373" />
            <channel session="1010383" begin="48373" end="48573" />
            <channel session="48" begin="48573" end="49433" />
            <channel session="1010452" begin="49433" end="49448" />
            <channel session="235" begin="49448" end="49453" />
            <channel session="1010001" begin="49453" end="49463" />
            <channel session="1010475" begin="49463" end="49468" />
            <channel session="1010383" begin="49468" end="49483" />
            <channel session="48" begin="49483" end="49508" />
            <channel session="1010403" begin="49508" end="49543" />
            <channel session="48" begin="49543" end="53839" />
            <channel session="1010493" begin="54460" end="54490" />
            <channel session="47" begin="54490" end="55035" />
            <channel session="1010379" begin="55035" end="55210" />
            <channel session="48" begin="55210" end="55220" />
            <channel session="1010397" begin="55220" end="55570" />
            <channel session="235" begin="55570" end="55580" />
            <channel session="1010425" begin="55580" end="55585" />
            <channel session="1010397" begin="55585" end="55620" />
            <channel session="235" begin="55620" end="55625" />
            <channel session="48" begin="55625" end="56875" />
            <channel session="1010379" begin="56875" end="57345" />
            <channel session="48" begin="57345" end="57615" />
            <channel session="1010379" begin="57615" end="57865" />
            <channel session="48" begin="57865" end="58079" />
            <channel session="1010354" begin="60848" end="60878" />
            <channel session="235" begin="60878" end="60893" />
            <channel session="48" begin="60893" end="60908" />
            <channel session="235" begin="60908" end="60913" />
            <channel session="1010072" begin="60913" end="60928" />
            <channel session="48" begin="60928" end="60933" />
            <channel session="235" begin="60933" end="60938" />
            <channel session="48" begin="60938" end="60943" />
            <channel session="235" begin="60943" end="60953" />
            <channel session="48" begin="60953" end="60968" />
            <channel session="235" begin="60968" end="60993" />
            <channel session="1010072" begin="60993" end="61483" />
            <channel session="1010499" begin="61483" end="61538" />
            <channel session="48" begin="61538" end="61543" />
            <channel session="47" begin="61543" end="61578" />
            <channel session="1010383" begin="61578" end="61603" />
            <channel session="1010072" begin="61603" end="63428" />
            <channel session="1010497" begin="63428" end="63438" />
            <channel session="1010319" begin="63438" end="63888" />
            <channel session="1010072" begin="63888" end="63898" />
            <channel session="235" begin="63898" end="63903" />
            <channel session="47" begin="63903" end="64368" />
            <channel session="48" begin="64368" end="64373" />
            <channel session="47" begin="64373" end="64378" />
            <channel session="1010072" begin="64378" end="64393" />
            <channel session="1010383" begin="64393" end="64408" />
            <channel session="1010001" begin="64408" end="64423" />
            <channel session="235" begin="64423" end="64428" />
            <channel session="1010469" begin="64428" end="64468" />
            <channel session="1010452" begin="64468" end="64473" />
            <channel session="48" begin="64473" end="64528" />
            <channel session="1010367" begin="64528" end="65053" />
            <channel session="48" begin="65053" end="65328" />
            <channel session="1010367" begin="65328" end="66613" />
            <channel session="1010383" begin="66613" end="66628" />
            <channel session="1010497" begin="66628" end="66823" />
            <channel session="47" begin="66823" end="66833" />
            <channel session="1010367" begin="66833" end="67538" />
            <channel session="1010383" begin="67538" end="67553" />
            <channel session="1010354" begin="67553" end="67563" />
            <channel session="1010497" begin="67563" end="67658" />
            <channel session="1010499" begin="67658" end="67713" />
            <channel session="1010072" begin="67713" end="67733" />
            <channel session="1010367" begin="67733" end="67803" />
            <channel session="48" begin="67803" end="68018" />
            <channel session="1010367" begin="68018" end="70963" />
            <channel session="1010383" begin="70963" end="71048" />
            <channel session="1010367" begin="71048" end="71848" />
            <channel session="1010383" begin="71848" end="72023" />
            <channel session="1010497" begin="72023" end="72043" />
            <channel session="1010072" begin="72043" end="72068" />
            <channel session="1010499" begin="72068" end="72083" />
            <channel session="47" begin="72083" end="72098" />
            <channel session="1010499" begin="72098" end="73643" />
            <channel session="1010072" begin="73643" end="73668" />
            <channel session="1010354" begin="73668" end="73683" />
            <channel session="1010383" begin="73683" end="73693" />
            <channel session="1010499" begin="73693" end="73708" />
            <channel session="1010072" begin="73708" end="73748" />
            <channel session="235" begin="73748" end="73753" />
            <channel session="1010497" begin="73753" end="73763" />
            <channel session="1010499" begin="73763" end="74323" />
            <channel session="1010497" begin="74323" end="74363" />
            <channel session="1010354" begin="74363" end="74378" />
            <channel session="1010499" begin="74378" end="74443" />
            <channel session="1010072" begin="74443" end="74453" />
            <channel session="1010499" begin="74453" end="74883" />
            <channel session="1010497" begin="74883" end="74938" />
            <channel session="1010072" begin="74938" end="74953" />
            <channel session="47" begin="74953" end="75603" />
            <channel session="1010497" begin="75603" end="77673" />
            <channel session="1010383" begin="77673" end="79208" />
            <channel session="1010354" begin="79208" end="79228" />
            <channel session="235" begin="79228" end="79233" />
            <channel session="1010497" begin="79233" end="79603" />
            <channel session="1010072" begin="79603" end="80088" />
            <channel session="1010497" begin="80088" end="80238" />
            <channel session="1010499" begin="80238" end="80253" />
            <channel session="235" begin="80253" end="80258" />
2
  • 1
    This is very similar to stackoverflow.com/questions/40787171/… . Using the solution marked (2) there instead of looping on //test, loop on //channel getting its session, begin and end attributes. Reach back through its ancestors to get the id attribute of its member node ancestor , the id attribute of its household node ancestor and date attribute of its households node ancestor. Alternately with a bit more work you could follow the faster solution marked (1). Commented Nov 30, 2016 at 0:24
  • Thanks Grothendieck! Commented Nov 30, 2016 at 18:58

2 Answers 2

2

This is all just elbow grease and being familiar with how XML is structured. The relatively new purrr package is very useful here. But, I think your question is still a bit vague and the lack of a valid XML file is more than problematic:

library(xml2)
library(purrr)
library(dplyr)

hh <- read_xml("~/data/so.xml")

xml_find_all(hh, ".//household") %>% 
  map_df(function(x) {
    id <- xml_attr(x, "id")
    members <- xml_find_all(x, ".//member")
    map_df(members, function(y) {
      channel_sessions <- xml_find_all(y, ".//channel")
      if (length(channel_sessions) == 0) {
        NULL
      } else {
        xml_attrs(channel_sessions) %>% 
          map_df(as.list) %>% 
          mutate(memberid=xml_attr(y, "id"),
                 household=id)
      }
    })
  }) %>% 
  mutate(date=xml_find_first(hh, "//households") %>% 
           xml_attr("creation_date") %>% 
           as.Date())
## # A tibble: 176 × 6
##    session begin   end memberid  household       date
##      <chr> <chr> <chr>    <chr>      <chr>     <date>
## 1  1010497 46722 46738        2 4FDYT14A1Q 2015-01-11
## 2       48 46738 46743        2 4FDYT14A1Q 2015-01-11
## 3      235 46743 46748        2 4FDYT14A1Q 2015-01-11
## 4       47 46748 46753        2 4FDYT14A1Q 2015-01-11
## 5  1010383 46753 46973        2 4FDYT14A1Q 2015-01-11
## 6  1010497 46973 46983        2 4FDYT14A1Q 2015-01-11
## 7  1010319 46983 46993        2 4FDYT14A1Q 2015-01-11
## 8  1010072 46993 47578        2 4FDYT14A1Q 2015-01-11
## 9  1010499 47578 47618        2 4FDYT14A1Q 2015-01-11
## 10      47 47618 47633        2 4FDYT14A1Q 2015-01-11
## # ... with 166 more rows
Sign up to request clarification or add additional context in comments.

Comments

0

With words of transform and xml in same title, consider XSLT, the special-purpose language designed to transform source XML files into end use needs such as import into R dataframes. Now the challenge presented is R does not have a dedicated library to run XSLT.

Nonetheless, R can call command lines to external processers (even in other languages like Python, PHP, Java, etc.) to convert input to output. Below are two versions: for Windows (using Powershell's call to XslCompiledTransform) and Linux/Mac (using xsltproc usually installed on most distributions):

XSLT Script (save as .xsl, a well-formed xml file)

<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
 <xsl:output version="1.0" encoding="UTF-8" indent="yes" />
 <xsl:strip-space elements="*"/>

  <xsl:template match="/households">   
   <xsl:copy>
     <xsl:apply-templates select="household"/>
   </xsl:copy>
  </xsl:template>

  <xsl:template match="household|members|member|member_process|individual_audience|individual_audience_tvset">
    <xsl:apply-templates/>
  </xsl:template>

  <xsl:template match="channel">
    <tvset>
      <date><xsl:value-of select="ancestor::households/@date"/></date>
      <household_id><xsl:value-of select="ancestor::household/@id"/></household_id>
      <member_id><xsl:value-of select="ancestor::member/@id"/></member_id>
      <session><xsl:value-of select="@session"/></session>
      <begin><xsl:value-of select="@begin"/></begin>
      <end><xsl:value-of select="@end"/></end>
    </tvset>    
  </xsl:template>

  <xsl:template match="tvsets"/>   
</xsl:transform>

PowerShell Script (save as .ps1 file, for Windows only)

param ($xml, $xsl, $output)

if (-not $xml -or -not $xsl -or -not $output) {
    Write-Host "& .\xslt.ps1 [-xml] xml-input [-xsl] xsl-input [-output] transform-output"
    exit;
}

trap [Exception]{
    Write-Host $_.Exception;
}

$xslt = New-Object System.Xml.Xsl.XslCompiledTransform;
$xslt.Load($xsl);
$xslt.Transform($xml, $output);

Write-Host "generated" $output;

R (Windows)

library(XML)

system(paste0('Powershell.exe -File',
              ' "C:\\Path\\To\\PowerShell\\Script.ps1"',
              ' "C:\\Path\\To\\Input.xml"',
              ' "C:\\Path\\To\\XSLT\\Script.xsl"', 
              ' "C:\\Path\\To\\Output.xml"'))

df <- xmlToDataFrame("C:\\Path\\To\\Output.xml")

head(df, 10)
#          date household_id member_id session begin   end
# 1  2015-01-05   4FDYT14A1Q         2 1010497 46722 46738
# 2  2015-01-05   4FDYT14A1Q         2      48 46738 46743
# 3  2015-01-05   4FDYT14A1Q         2     235 46743 46748
# 4  2015-01-05   4FDYT14A1Q         2      47 46748 46753
# 5  2015-01-05   4FDYT14A1Q         2 1010383 46753 46973
# 6  2015-01-05   4FDYT14A1Q         2 1010497 46973 46983
# 7  2015-01-05   4FDYT14A1Q         2 1010319 46983 46993
# 8  2015-01-05   4FDYT14A1Q         2 1010072 46993 47578
# 9  2015-01-05   4FDYT14A1Q         2 1010499 47578 47618
# 10 2015-01-05   4FDYT14A1Q         2      47 47618 47633

R (Linux/Mac)

library(XML)

system('xsltproc /path/to/XSLT_Script.xsl /path/to/Input.xml -o path/to/Output.xml')

df <- xmlToDataFrame("path/to/Output.xml")

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.