Add files using upload-large-folder tool
Browse files- .gitattributes +1 -0
- assets/head.png +3 -0
- data/agilex/hdf5totfrecords.py +114 -0
- data/empty_lang_embed.pt +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
assets/head.png filter=lfs diff=lfs merge=lfs -text
|
assets/head.png
ADDED
|
Git LFS Details
|
data/agilex/hdf5totfrecords.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import tensorflow as tf
|
| 2 |
+
import h5py
|
| 3 |
+
import os
|
| 4 |
+
import fnmatch
|
| 5 |
+
import shutil
|
| 6 |
+
from tqdm import tqdm
|
| 7 |
+
from multiprocessing import Pool
|
| 8 |
+
import numpy as np
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _bytes_feature(value):
    """Wrap a string / byte value in a bytes_list tf.train.Feature."""
    eager_tensor_type = type(tf.constant(0))
    # BytesList won't unpack a string out of an EagerTensor, so pull the
    # raw value out of tensors first.
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw]))
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _bool_feature(value):
    """Wrap a boolean in an int64_list tf.train.Feature (True -> 1, False -> 0)."""
    as_int = int(value)
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[as_int]))
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def serialize_example(action, base_action, qpos, qvel, cam_high, cam_left_wrist, cam_right_wrist, instruction, terminate_episode):
    """Serialize one timestep of an episode into a tf.train.Example byte string.

    Array-like state fields (action, base_action, qpos, qvel) are stored as
    serialized tensors; camera frames are stored as their raw bytes wrapped
    in a string tensor; the instruction is stored as-is and the
    terminate_episode flag as an int64.
    """
    def _tensor_feature(arr):
        # Serialize an array/tensor into a bytes feature.
        return _bytes_feature(tf.io.serialize_tensor(arr))

    def _frame_feature(frame):
        # Store a camera frame's raw bytes inside a serialized string tensor.
        as_string_tensor = tf.convert_to_tensor(frame.tobytes(), dtype=tf.string)
        return _bytes_feature(tf.io.serialize_tensor(as_string_tensor))

    feature = {
        'action': _tensor_feature(action),
        'base_action': _tensor_feature(base_action),
        'qpos': _tensor_feature(qpos),
        'qvel': _tensor_feature(qvel),
        'cam_high': _frame_feature(cam_high),
        'cam_left_wrist': _frame_feature(cam_left_wrist),
        'cam_right_wrist': _frame_feature(cam_right_wrist),
        'instruction': _bytes_feature(instruction),
        'terminate_episode': _bool_feature(terminate_episode),
    }
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    return example.SerializeToString()
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def process_hdf5_file(args):
    """Convert a single HDF5 episode file into a TFRecord file.

    Args:
        args: Tuple of (filepath, root_dir, out_dir), packed as one tuple
            so this function can be used with Pool.imap_unordered.

    Returns:
        A status string: success, "already exists" skip, or failure.
    """
    filepath, root_dir, out_dir = args
    # Mirror the source directory layout under out_dir.
    output_dir = os.path.join(out_dir, os.path.relpath(os.path.dirname(filepath), root_dir))
    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.basename(filepath)
    tfrecord_path = os.path.join(output_dir, filename.replace('.hdf5', '.tfrecord'))

    # Skip files that were already fully converted (non-empty output).
    if os.path.exists(tfrecord_path) and os.path.getsize(tfrecord_path) > 0:
        return f"TFRecords already exist at {tfrecord_path}"
    try:
        with h5py.File(filepath, 'r') as f, tf.io.TFRecordWriter(tfrecord_path) as writer:
            num_steps = f['action'].shape[0]
            # Remove the first few still steps: find the first timestep whose
            # qpos differs from the initial qpos by more than EPS.
            EPS = 1e-2
            qpos_all = f['observations']['qpos'][:]
            qpos_delta = np.abs(qpos_all - qpos_all[0:1])
            indices = np.where(np.any(qpos_delta > EPS, axis=1))[0]
            if len(indices) > 0:
                first_idx = indices[0]
            else:
                raise ValueError("Found no qpos that exceeds the threshold.")

            # Start one step before the first moving step (first_idx >= 1,
            # since step 0's delta against itself is zero).
            for i in range(first_idx - 1, num_steps):
                action = f['action'][i]
                base_action = f['base_action'][i]
                qpos = f['observations']['qpos'][i]
                qvel = f['observations']['qvel'][i]
                cam_high = f['observations']['images']['cam_high'][i]
                cam_left_wrist = f['observations']['images']['cam_left_wrist'][i]
                cam_right_wrist = f['observations']['images']['cam_right_wrist'][i]
                instruction = f['instruction'][()]
                terminate_episode = i == num_steps - 1
                serialized_example = serialize_example(action, base_action, qpos, qvel, cam_high, cam_left_wrist, cam_right_wrist, instruction, terminate_episode)
                writer.write(serialized_example)
    except Exception as e:
        # Remove the partially-written output so a rerun does not mistake it
        # for a finished conversion (see the size check above).
        if os.path.exists(tfrecord_path):
            os.remove(tfrecord_path)
        with open("error_log.txt", "a") as log_file:
            log_file.write(f"{filepath}\n")
        print(f"error at {filepath}: {e}")
        return f"Failed to write TFRecords for {filepath}: {e}"
    return f"TFRecords written to {tfrecord_path}"
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def write_tfrecords(root_dir, out_dir, num_workers=16):
    """Convert every .hdf5 file under root_dir into TFRecords under out_dir.

    The source directory structure is mirrored under out_dir, and any
    instruction JSON files found alongside the episodes are copied over
    (plain "expanded_instruction.json" files are renamed to the canonical
    "expanded_instruction_gpt-4-turbo.json").

    Args:
        root_dir: Directory tree containing .hdf5 episode files.
        out_dir: Destination root for the .tfrecord files.
        num_workers: Size of the multiprocessing pool (default 16).
    """
    os.makedirs(out_dir, exist_ok=True)

    gpt4_name = "expanded_instruction_gpt-4-turbo.json"
    plain_name = "expanded_instruction.json"

    hdf5_files = []
    for root, dirs, files in os.walk(root_dir):
        target_path = os.path.join(out_dir, os.path.relpath(root, root_dir))
        if os.path.exists(os.path.join(root, gpt4_name)):
            # Copy the instruction file next to the converted episodes.
            os.makedirs(target_path, exist_ok=True)
            shutil.copy(os.path.join(root, gpt4_name), target_path)
        elif os.path.exists(os.path.join(root, plain_name)):
            print(root)
            os.makedirs(target_path, exist_ok=True)
            # Copy and rename in one step to the canonical gpt-4-turbo name.
            shutil.copy(os.path.join(root, plain_name),
                        os.path.join(target_path, gpt4_name))
        for filename in fnmatch.filter(files, '*.hdf5'):
            filepath = os.path.join(root, filename)
            hdf5_files.append((filepath, root_dir, out_dir))

    with Pool(num_workers) as pool:
        max_count = len(hdf5_files)
        with tqdm(total=max_count) as pbar:
            for _ in pool.imap_unordered(process_hdf5_file, hdf5_files):
                pbar.update(1)

    print(f"TFRecords written to {out_dir}")
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# Script entry point. The __main__ guard is required because
# write_tfrecords spawns a multiprocessing Pool: without it, worker
# processes re-importing this module (spawn start method) would
# recursively relaunch the conversion.
if __name__ == "__main__":
    root_dir = "../datasets/agilex/rdt_data/"
    out_dir = "../datasets/agilex/tfrecords/"
    write_tfrecords(root_dir, out_dir)
|
data/empty_lang_embed.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b073685d3b8627ac068e7907f4d53e1b831729fd34e01e05ed96ebe53bf19633
|
| 3 |
+
size 9432
|