Yannick DAYER · a0ad81de
--- a/How-to-port-old-bob-database-interface-to-the-new-csv-format.md
+++ b/How-to-port-old-bob-database-interface-to-the-new-csv-format.md
@@ -103,5 +103,125 @@ if __name__ == "__main__":
    convert_database()
 ```

+## Adding custom metadata to the CSV files
+
+The following custom script creates the CSV protocol definition files for the _replay-mobile_ image dataset from a bob 8 database. This dataset needs some extra metadata fields, which are added as additional columns in the CSV files:
+
+- `should_flip` indicates if the data in the sample's image needs to be mirrored horizontally;
+- `attack_type` indicates if a probe is an attack, and what type of attack is applied.
+
+These fields are constructed in the `add_metadata` function from the filename of each sample and from the kind of sample (genuine or attack), since the two kinds use different filename formats.
+
+This script has to be run in a bob 8 environment.
+
+```python
+from bob.bio.face.database.replaymobile import ReplayMobileBioDatabase
+import os
+import tarfile
+from csv import DictWriter
+
+# Exceptions to the flip rule: some 'tablet' files need flipping too (matched on the first '_'-separated token of the file id):
+FORCE_FLIP_IDS = ["26", "27"]
+
+def convert_replaymobile_to_csv():
+    database = ReplayMobileBioDatabase()
+
+    # replaymobile-img has every protocol duplicated ('X-licit' and 'X-spoof')
+    all_protocols = database._db.protocol_names()
+    all_protocols = [p.replace("-licit", "") for p in all_protocols]
+    all_protocols = [p.replace("-spoof", "") for p in all_protocols]
+    all_protocols = list(set(all_protocols))
+
+    for protocol in all_protocols:
+        # Retrieve the file lists from the legacy db
+        train_files = database.objects(groups=["world"], protocol=protocol+"-licit", purposes=["enroll"])
+        dev_enroll = database.objects(groups=["dev"], protocol=protocol+"-licit", purposes=["enroll"])
+        dev_probe_licit = database.objects(groups=["dev"], protocol=protocol+"-licit", purposes=["probe"])
+        dev_probe_spoof = database.objects(groups=["dev"], protocol=protocol+"-spoof", purposes=["probe"])
+        eval_enroll = database.objects(groups=["eval"], protocol=protocol+"-licit", purposes=["enroll"])
+        eval_probe_licit = database.objects(groups=["eval"], protocol=protocol+"-licit", purposes=["probe"])
+        eval_probe_spoof = database.objects(groups=["eval"], protocol=protocol+"-spoof", purposes=["probe"])
+
+        # Check that the lists are not empty
+        has_eval, has_train = True, True
+        if not all([eval_enroll, eval_probe_licit, eval_probe_spoof]):
+            has_eval = False
+        if not train_files:
+            has_train = False
+
+        def add_metadata(list_of_files, attack=False):
+            """Adds metadata fields to each file in the list"""
+            for f in list_of_files:
+                f.frame = int(f.path[-3:])
+                f.path = f.path[:-4]
+                split_path = f.path.split('_')
+                if not attack: # Genuine files have one filename format
+                    f.reference_id = int(split_path[-5][-3:])
+                    f.purpose = split_path[-3]
+                else: # Attack files have a different filename format
+                    f.reference_id = int(split_path[-7][-3:])
+                    f.capturing_device = split_path[-3]
+                    f.purpose = "attack"
+                    f.attack_type = "spoof"
+                f.capturing_device = split_path[-2]
+                f.should_flip = f.capturing_device == "mobile" or f.id.split('_')[0] in FORCE_FLIP_IDS
+
+        # Add the metadata to each file in each list
+        add_metadata(dev_enroll)
+        add_metadata(dev_probe_licit)
+        add_metadata(dev_probe_spoof, True)
+        if has_eval:
+            add_metadata(eval_enroll)
+            add_metadata(eval_probe_licit)
+            add_metadata(eval_probe_spoof, True)
+        if has_train:
+            add_metadata(train_files)
+
+        # Create the folder structure
+        protocol_path = os.path.join("replaymobile-img", protocol)
+        dev_path = os.path.join(protocol_path, "dev")
+        os.makedirs(dev_path, exist_ok=True)
+        if has_eval:
+            eval_path = os.path.join(protocol_path, "eval")
+            os.makedirs(eval_path, exist_ok=True)
+        if has_train:
+            train_path = os.path.join(protocol_path, "norm")
+            os.makedirs(train_path, exist_ok=True)
+
+        # Writing the CSV files
+        def write_to_csv(path, filelist, header, fields):
+            with open(path, "w") as f:
+                csv_writer = DictWriter(f, delimiter=',', fieldnames=header)
+                csv_writer.writeheader()
+                csv_writer.writerows([{k:v for k,v in zip(header, [getattr(s, a, None) for a in fields])} for s in filelist])
+
+        # Columns in the csv (header)
+        csv_fields = ["PATH", "REFERENCE_ID", "ID", "FRAME", "PURPOSE", "SHOULD_FLIP"]
+        # Corresponding fields in the File objects
+        file_attr = ["path", "reference_id", "id", "frame", "purpose", "should_flip"]
+
+        # Probe header have some special metadata/columns
+        csv_fields_probes = csv_fields + ["ATTACK_TYPE", ]
+        file_attr_probes = file_attr + ["attack_type", ]
+
+        write_to_csv(os.path.join(dev_path, "for_models.csv"), dev_enroll, csv_fields, file_attr)
+        write_to_csv(os.path.join(dev_path, "for_probes.csv"), dev_probe_licit+dev_probe_spoof, csv_fields_probes, file_attr_probes)
+        if has_eval:
+            write_to_csv(os.path.join(eval_path, "for_models.csv"), eval_enroll, csv_fields, file_attr)
+            write_to_csv(os.path.join(eval_path, "for_probes.csv"), eval_probe_licit+eval_probe_spoof, csv_fields_probes, file_attr_probes)
+        if has_train:
+            write_to_csv(os.path.join(train_path, "train_world.csv"), train_files, csv_fields, file_attr)
+
+    # Create the final tarball
+    path = f"bio-face-replaymobile-img.tar.gz"
+    with tarfile.open(path, "w:gz") as tar:
+        tar.add("replaymobile-img", arcname=".")
+
+    print(f"Created '{path}'.")
+
+if __name__ == "__main__":  # Only run the conversion when this file is executed as a script
+    convert_replaymobile_to_csv()
+```
+
 # Documentation
 Document the protocols, data format, metadata
\ No newline at end of file