C：节点移除

Question

关于链表和节点删除的练习，我真的很吃力。

基本上，当一个序列中的每个碱基在相同位置的另一个序列中具有相应的互补序列时，就称 DNA 序列是互补的。碱基A与T键合，碱基C与G键合。例如序列ACGT和TGCA是互补的。

我必须实现一个包含两个链表的函数：第一个是原始 DNA，第二个是编辑序列。该函数必须应用前面描述的编辑方法和 return 新列表。

例如，我们将“A C G T A G A C G T T C T A”作为原始 DNA 序列，将“G C A”作为键合序列。 G 与 C 匹配，C 与 G 匹配，A 与 T 匹配（和 vice-versa）。因此，“C G T”是“G C A”的反射。然后我们必须按照这个确切的顺序从原始DNA序列中删除“C”、“G”和T，这就是下面所述的预期结果。

此外，还有两件事：1. 我必须忽略由于删除任何节点而生成的任何后续匹配项。 2. 除了 stdlib.h 和 malloc() 或 calloc() 等函数，我不能使用任何 headers。我只允许使用 free()。

输入：

A C G T A G A C G T T C T A
G C A

预期结果：

A A G A T C T A

实际结果：

A A G A C G T T C T A

如有任何想法，我们将不胜感激。

谢谢！

Answer 1

我应该根据你现在的代码写一个更好的代码沟通，但我认为不使用递归会更简单。你能试试吗：

#include <stdio.h>
#include <stdlib.h>

typedef struct LinkedNode {
    char data;
    struct LinkedNode *next;
} LinkedNode;

/*
 * append the node with data at the end of the list
 */
void append(LinkedNode *head, char data)
{
    LinkedNode *newNode = malloc(sizeof(LinkedNode));
    newNode->data = data;
    newNode->next = NULL;

    LinkedNode *temp = head;
    while (temp->next != NULL) {
        temp = temp->next;
    }
    temp->next = newNode;
}

/*
 * dump the data of the list
 */
void dump(LinkedNode *head)
{
    for (head = head->next; head != NULL; head = head->next) {
        printf("%c ", head->data);
    }
    printf("\n");
}

/*
 * generate another list of complimentary sequence
 */
void gencomp(LinkedNode *dest, LinkedNode *src)
{
    char s_data, d_data;
    for (src = src->next; src != NULL; src = src->next) {
        s_data = src->data;
        switch(s_data) {
            case 'A':
                d_data = 'T';
                break;
            case 'T':
                d_data = 'A';
                break;
            case 'C':
                d_data = 'G';
                break;
            case 'G':
                d_data = 'C';
                break;
            default:
                fprintf(stderr, "unknown base: %c\n", s_data);
                exit(1);
        }
        append(dest, d_data);
    }
}

/*
 * return the pointer to the last element if the subsequences match
 */
LinkedNode *match(LinkedNode *head, LinkedNode *comp)
{
    for (; head != NULL; head = head->next, comp = comp->next) {
        if (head->data != comp->data) {
            return NULL;
        } else if (comp->next == NULL) {
            return head;
        }
    }
    return NULL;
}

/*
 * search "head" for "comp". If matched, the matched portion is skipped
 * (not freed)
 */
void edit(LinkedNode *head, LinkedNode *comp)
{
    LinkedNode *matched;

    for (; head != NULL; head = head->next) {
        if ((matched = match(head->next, comp->next))) {
            head->next = matched->next;         // skip matched sequence
        }
    }
}

int main()
{
    char dna_orig[] = "ACGTAGACGTTCTA";
    char dna_bond[] = "GCA";
    int i;

    // generate list of original sequence
    LinkedNode *head = malloc(sizeof(LinkedNode));
    head->next = NULL;
    for (i = 0; dna_orig[i] != 0; i++) {
        append(head, dna_orig[i]);
    }

    // generate list of bonding sequence
    LinkedNode *bond = malloc(sizeof(LinkedNode));
    bond->next = NULL;
    for (i = 0; dna_bond[i] != 0; i++) {
        append(bond, dna_bond[i]);
    }

    // generate list of complementary sequence of bond
    LinkedNode *comp = malloc(sizeof(LinkedNode));
    comp->next = NULL;
    gencomp(comp, bond);

    // edit the original sequence and see the result
    edit(head, comp);
    dump(head);

    return 0;
}

输出：

A A G A T C T A

我已经创建了键合序列的互补序列列表方便搜索。
到目前为止，代码效率低下，因为它在列表中使用了重复的线性搜索。
出于打印目的，我必须包含。

[编辑]
释放跳过的节点时，我们不能按前向顺序释放它们从头到尾，因为到下一个节点的link一旦丢失第一个节点被释放。然后我们可以利用 recursion 来释放它们以相反的顺序。这是释放跳过的节点的函数：

void freenodes(LinkedNode *head, LinkedNode *tail)
{
    if (head->next == tail) {                   // the last node to remove
        free(head);
        return;
    }
    freenodes(head->next, tail);                // free nodes recursively
    free(head);
}

函数的第一个参数逐个递增，直到到达匹配序列的最后一个节点。然后函数 returns 给调用者回到匹配序列的第一个节点，同时释放节点。这是更新后的完整代码，包括 edit 函数的修改到 return 已编辑列表的头部。

#include <stdio.h>
#include <stdlib.h>

typedef struct LinkedNode {
    char data;
    struct LinkedNode *next;
} LinkedNode;

/*
 * append the node with data at the end of the list
 */
void append(LinkedNode *head, char data)
{
    LinkedNode *newNode = malloc(sizeof(LinkedNode));
    newNode->data = data;
    newNode->next = NULL;

    LinkedNode *temp = head;
    while (temp->next != NULL) {
        temp = temp->next;
    }
    temp->next = newNode;
}

/*
 * dump the data of the list
 */
void dump(LinkedNode *head)
{
    for (head = head->next; head != NULL; head = head->next) {
        printf("%c ", head->data);
    }
    printf("\n");
}

/*
 * compare characters complementarily
 * return 1 for matching pairs as A<=>T or C<=>G
 */
int compcomp(char a, char b)
{
    if ((a == 'A' && b == 'T') || (a == 'T' && b == 'A')
        || (a == 'C' && b == 'G') || (a == 'G' && b == 'C'))
        return 1;
    else
        return 0;
}

/*
 * return the pointer to the last node if the subsequences match complementarily
 */
LinkedNode *match(LinkedNode *head, LinkedNode *bond)
{
    for (; head != NULL; head = head->next, bond = bond->next) {
        if (! compcomp(head->data, bond->data)) {
            return NULL;
        } else if (bond->next == NULL) {
            return head;
        }
    }
    return NULL;
}

/*
 * free skipped nodes
 */
void freenodes(LinkedNode *head, LinkedNode *tail)
{
    if (head->next == tail) {                   // the last node to remove
        // printf("> %c\n", head->data);        // for debugging
        free(head);
        return;
    }
    freenodes(head->next, tail);                // free nodes recursively
    // printf("> %c\n", head->data);            // for debugging
    free(head);
}

/*
 * search "head" for the sequence of complementary of "bond". If matched,
 * the matched portion is skipped and skipped nodes are freed
 */
LinkedNode *edit(LinkedNode *head, LinkedNode *bond)
{
    LinkedNode *matched;                        // last node of matched sequence
    LinkedNode *matchednext;
    LinkedNode *tmp;                            // copy of head to traverse the list

    for (tmp = head; tmp != NULL; tmp = tmp->next) {
        if ((matched = match(tmp->next, bond->next))) {
            matchednext = matched->next;        // backup matched->next
            freenodes(tmp->next, matched->next);
            tmp->next = matchednext;            // skip matched sequence
        }
    }

    return head;
}

int main()
{
    char dna_orig[] = "ACGTAGACGTTCTA";
    char dna_bond[] = "GCA";
    int i;

    // generate list of original sequence
    LinkedNode *head = malloc(sizeof(LinkedNode));
    head->next = NULL;
    for (i = 0; dna_orig[i] != 0; i++) {
        append(head, dna_orig[i]);
    }

    // generate list of bonding sequence
    LinkedNode *bond = malloc(sizeof(LinkedNode));
    bond->next = NULL;
    for (i = 0; dna_bond[i] != 0; i++) {
        append(bond, dna_bond[i]);
    }

    // edit the original sequence and see the result
    dump(edit(head, bond));

    return 0;
}

顺便说一句，您可以通过删除仅用于报告目的的 printf() 和 fprintf() 函数来省略 #include <stdio.h>。

[EDIT2]
你的代码和我的代码之间的主要区别是数据结构 linked 列表。我列表的 head 有空数据，是入口到第一个节点，而你的 dna_original 直接从节点开始其中包含数据。与 bond 和 sequencia 相同。两者各有优势，但不能混用。那么请修改你的editar_dna为：

LinkedNode *editar_dna(LinkedNode *dna_original, LinkedNode *seq_edicao){
    LinkedNode *matched, *matchednext, *tmp;

    for (tmp = dna_original; tmp != NULL; tmp = tmp->next) {
        if ((matched = match(tmp->next, seq_edicao))) {
            matchednext = matched->next;
            liberar_nos(tmp->next, matched->next);
            tmp->next = matchednext;
        }
    }

    return dna_original;
}

在 match() 函数中将 seq_edicao->next 更改为 seq_edicao。然后它会输出 A A A.
您的数据结构的潜在问题是我们无法删除起始子序列，例如："CGTACGTA"。它在技术上可以修复，但可能需要额外的大量修改。

现在你有三个选择：

忽略以匹配子序列开头的边缘情况。
修改linked列表结构（简单但我不确定是否可以接受。）
找到保持当前数据结构的对策。

由你决定。 :)

[EDIT3]
这是将选项 3（w/o main 的修改）应用于您的代码（仅供参考）的版本：

#include <stdio.h>
#include <stdlib.h>

typedef struct LinkedNode LinkedNode;
struct LinkedNode {
   char data;
   LinkedNode *next;
};

void imprimir1(LinkedNode *inicio){
    if(inicio == NULL){
    return;
    }

    printf("%c", inicio->data);
    if (inicio->next!=NULL) printf(" ");
    else printf("\n");
    imprimir1(inicio->next);
    return;
}

void liberar_lista(LinkedNode *inicio){
    if(inicio == NULL) return;
    liberar_lista(inicio->next);
    free(inicio);
}

LinkedNode *inserir_final_r(LinkedNode *inicio, char valor) {
    if (inicio == NULL) {
        LinkedNode *novo = malloc(sizeof(LinkedNode));
        if (novo == NULL) return inicio;
        novo->data = valor;
        novo->next = NULL;
        return novo;
    }
    inicio->next = inserir_final_r(inicio->next, valor);
    return inicio;
}

void liberar_nos(LinkedNode *head, LinkedNode *tail) {
    if (head == tail) {
        return;
    }
    liberar_nos(head->next, tail);
    free(head);
}

int compare(char a, char b) {
    if ((a == 'A' && b == 'T') || (a == 'T' && b == 'A')
        || (a == 'C' && b == 'G') || (a == 'G' && b == 'C'))
        return 1;
    else
        return 0;
}

LinkedNode *match(LinkedNode *head, LinkedNode *bond) {
    for (; head != NULL; head = head->next, bond = bond->next) {
        if (! compare(head->data, bond->data)) {
            return NULL;
        } else if (bond->next == NULL) {
            return head;
        }
    }
    return NULL;
}

LinkedNode *editar_dna(LinkedNode *dna_original, LinkedNode *seq_edicao){
    LinkedNode *matched, *matchednext, *tmp;

    // remove leading matched subsequence(s) as a preproces
    while ((matched = match(dna_original, seq_edicao))) {
        matchednext = matched->next;
        liberar_nos(dna_original->next, matched->next); // note we cannot free the 1st node
        dna_original->data = matchednext->data;
        dna_original->next = matchednext->next;
        liberar_nos(matchednext, matchednext->next);    // free the copied node which is no longer used
    }
/*
    for (tmp = dna_original; tmp != NULL; tmp = tmp->next) {
        if ((matched = match(tmp->next, seq_edicao))) {
            matchednext = matched->next;
            liberar_nos(tmp->next, matched->next);
            tmp->next = matchednext;
        }
    }
*/
    for (tmp = dna_original; tmp != NULL; ) {
        if ((matched = match(tmp->next, seq_edicao))) {
            matchednext = matched->next;
            liberar_nos(tmp->next, matched->next);
            tmp->next = matchednext;                    // skip matched sequence
        } else {
            tmp = tmp->next;                            // proceed to next node
        }
    }

    return dna_original;
}

int main(){

    //Creating empty nodes
    LinkedNode *dna_original = NULL;
    LinkedNode *sequencia = NULL;

    //Populating test data into the original sequence
//    dna_original = inserir_final_r(dna_original, 'A');        // dropped just for the test
    dna_original = inserir_final_r(dna_original, 'C');
    dna_original = inserir_final_r(dna_original, 'G');
    dna_original = inserir_final_r(dna_original, 'T');
    dna_original = inserir_final_r(dna_original, 'C');
    dna_original = inserir_final_r(dna_original, 'G');
    dna_original = inserir_final_r(dna_original, 'T');
    dna_original = inserir_final_r(dna_original, 'A');
    dna_original = inserir_final_r(dna_original, 'C');
    dna_original = inserir_final_r(dna_original, 'G');
    dna_original = inserir_final_r(dna_original, 'T');
    dna_original = inserir_final_r(dna_original, 'A');

    //Populating test data into the second sequence
    sequencia = inserir_final_r(sequencia, 'G');
    sequencia = inserir_final_r(sequencia, 'C');
    sequencia = inserir_final_r(sequencia, 'A');

    //Printing the original sequence before change
    imprimir1(dna_original);

    //Changing the sequence
    editar_dna(dna_original, sequencia);

    //Printing the original sequence after change
    imprimir1(dna_original);

    //Freeing allocated memory
    liberar_lista(dna_original);
    liberar_lista(sequencia);

    return 0;
}

请注意，为了调试和演示目的修改了 main() 中列表的初始化。

Answer 2

根据标题和讨论，我假设应用编辑应该通过删除匹配的子序列来改变输入列表。如果列表的第一个节点被删除，您将需要 return 指向后面某个节点的指针。否则第一个节点是 returned.

处理这两种情况的简单方法是进行特定检查。但是，有一个 well-known 模式可以将这两种情况作为一个来处理。它是 pre-pending 暂时的“虚拟”头节点，永远不会被删除。我不会尝试进一步解释，而是附加代码。参见 apply_edit。

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>

typedef struct node_s {
  struct node_s *next;
  char base;
} Node;

void print(Node *seq) {
  for (Node *p = seq; p; p = p->next) printf("%c", p->base);
  printf("\n");
}

bool base_match(char x, char y) {
  switch (x) {
    case 'A': return y == 'T';
    case 'C': return y == 'G';
    case 'G': return y == 'C';
    case 'T': return y == 'A';
  }
  exit(42); // input error
}

// If the prefix of seq is as given, return the following seq node, else seq.
Node *remove_prefix(Node *prefix, Node *seq) {
  Node *p, *s;
  for (p = prefix, s = seq; p && s; p = p->next, s = s->next)
    if (!base_match(p->base, s->base))
      return seq;
  return p ? seq : s;
}

Node *apply_edit(Node *edit, Node *seq) {
  Node head[1] = {{ seq }};
  for (Node *p = head; p && p->next; ) {
    Node *next = remove_prefix(edit, p->next);
    if (next == p->next) p = next;  // No match; advance one.
    else p->next = next;  // Remove the match; don't advance.
  }
  return head->next;
}

Node *build_seq(char *gene) {
  Node *seq = NULL;
  for (int i = strlen(gene) - 1; i >= 0; --i) {
    Node *node = malloc(sizeof *node);
    node->base = gene[i];
    node->next = seq;
    seq = node;
  }
  return seq;
}

int main(void) {
  // Provided test case. Expect AAGATCTA.
  print(apply_edit(build_seq("GCA"), build_seq("ACGTAGACGTTCTA")));
  // Remove from head. Expect A.
  print(apply_edit(build_seq("GCA"), build_seq("CGTA")));
  // Remove from tail. Expect A.
  print(apply_edit(build_seq("GCA"), build_seq("ACGT")));
  // Incomplete match at tail. Expect ACG
  print(apply_edit(build_seq("GCA"), build_seq("ACG")));
  // Remove all. Expect empty string.
  print(apply_edit(build_seq("GCA"), build_seq("CGTCGT")));
  // Remove creates new match, which is ignored. Expect CGT.
  print(apply_edit(build_seq("GCA"), build_seq("CCGTGT")));

  return 0;
}

尽管这会使代码更难阅读，但您可以在 apply_edit 中调用辅助函数的地方内联它，并获得非常简洁的内容：

Node *apply_edit(Node *edit, Node *seq) {
  Node *e, *s, head[1] = {{ seq }};
  for (Node *p = head; p && p->next; ) {
    for (e = edit, s = p->next; e && s; e = e->next, s = s->next)
      if (!base_match(e->base, s->base)) break;
    if (e) p = s;  // No match; advance one.
    else p->next = s;  // Remove the match; don't advance.
  }
  return head->next;
}

我还要指出，您不会在生产代码中使用递归遍历列表。虽然一个带有优化的好编译器会将尾递归（像你的）转换为迭代，但它可能是一个挑剔的优化......因为编译器会在 non-obvious 情况下跳过它。如果发生这种情况，您的程序需要堆栈 space 与您正在遍历的列表的长度成比例，而迭代代码将运行在一个堆栈帧中正常。如果您的程序需要可靠，那就很重要了。

C：节点移除

C: Node Removal

c

recursion

linked-list