1 /* 2 * Copyright 2006-2007, Haiku, Inc. All Rights Reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Andrew Galante, haiku.galante@gmail.com 7 * Axel Dörfler, axeld@pinc-software.de 8 * Hugo Santos, hugosantos@gmail.com 9 */ 10 11 12 #include "TCPEndpoint.h" 13 #include "EndpointManager.h" 14 15 #include <net_buffer.h> 16 #include <net_datalink.h> 17 #include <net_stat.h> 18 #include <NetBufferUtilities.h> 19 #include <NetUtilities.h> 20 21 #include <lock.h> 22 #include <util/AutoLock.h> 23 #include <util/khash.h> 24 #include <util/list.h> 25 26 #include <KernelExport.h> 27 28 #include <netinet/in.h> 29 #include <netinet/ip.h> 30 #include <netinet/tcp.h> 31 #include <new> 32 #include <stdlib.h> 33 #include <string.h> 34 35 36 // References: 37 // - RFC 793 - Transmission Control Protocol 38 // - RFC 813 - Window and Acknowledgement Strategy in TCP 39 // 40 // Things this implementation currently doesn't implement: 41 // 42 // TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery, RFC 2001, RFC 2581, RFC 3042 43 // NewReno Modification to TCP's Fast Recovery, RFC 2582 44 // Explicit Congestion Notification (ECN), RFC 3168 45 // SYN-Cache 46 // TCP Extensions for High Performance, RFC 1323 47 // SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517 48 // Forward RTO-Recovery, RFC 4138 49 50 #define PrintAddress(address) \ 51 AddressString(Domain(), address, true).Data() 52 53 //#define TRACE_TCP 54 //#define PROBE_TCP 55 56 #ifdef TRACE_TCP 57 // the space before ', ##args' is important in order for this to work with cpp 2.95 58 # define TRACE(format, args...) dprintf("TCP [%llu] %p (%12s) " format "\n", \ 59 system_time(), this, name_for_state(fState) , ##args) 60 #else 61 # define TRACE(args...) do { } while (0) 62 #endif 63 64 #ifdef PROBE_TCP 65 # define PROBE(buffer, window) \ 66 dprintf("TCP PROBE %llu %s %s %ld %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu\n", \ 67 system_time(), PrintAddress(buffer->source), \ 68 PrintAddress(buffer->destination), buffer->size, (uint32)fSendNext, \ 69 (uint32)fSendUnacknowledged, fCongestionWindow, fSlowStartThreshold, \ 70 window, fSendWindow, (uint32)(fSendMax - fSendUnacknowledged), \ 71 fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout) 72 #else 73 # define PROBE(buffer, window) do { } while (0) 74 #endif 75 76 // Initial estimate for packet round trip time (RTT) 77 #define TCP_INITIAL_RTT 2000000 78 79 // constants for the fFlags field 80 enum { 81 FLAG_OPTION_WINDOW_SCALE = 0x01, 82 FLAG_OPTION_TIMESTAMP = 0x02, 83 // TODO: Should FLAG_NO_RECEIVE apply as well to received connections? 84 // That is, what is expected from accept() after a shutdown() 85 // is performed on a listen()ing socket. 86 FLAG_NO_RECEIVE = 0x04, 87 }; 88 89 90 static const int kTimestampFactor = 1024; 91 92 93 static inline bigtime_t 94 absolute_timeout(bigtime_t timeout) 95 { 96 if (timeout == 0 || timeout == B_INFINITE_TIMEOUT) 97 return timeout; 98 99 return timeout + system_time(); 100 } 101 102 103 static inline status_t 104 posix_error(status_t error) 105 { 106 if (error == B_TIMED_OUT) 107 return B_WOULD_BLOCK; 108 109 return error; 110 } 111 112 113 static inline bool 114 in_window(const tcp_sequence &sequence, const tcp_sequence &rcvNext, 115 uint32 rcvWindow) 116 { 117 return sequence >= rcvNext && sequence < (rcvNext + rcvWindow); 118 } 119 120 121 static inline bool 122 segment_in_sequence(const tcp_segment_header &segment, int size, 123 const tcp_sequence &rcvNext, uint32 rcvWindow) 124 { 125 tcp_sequence sequence(segment.sequence); 126 if (size == 0) { 127 if (rcvWindow == 0) 128 return sequence == rcvNext; 129 return in_window(sequence, rcvNext, rcvWindow); 130 } else { 131 if (rcvWindow == 0) 132 return false; 133 return in_window(sequence, rcvNext, rcvWindow) 134 || in_window(sequence + size - 1, rcvNext, rcvWindow); 135 } 136 } 137 138 139 static inline bool 140 is_writable(tcp_state state) 141 { 142 return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED 143 || state == ESTABLISHED || state == FINISH_RECEIVED; 144 } 145 146 147 static inline uint32 tcp_now() 148 { 149 return system_time() / kTimestampFactor; 150 } 151 152 153 static inline uint32 tcp_diff_timestamp(uint32 base) 154 { 155 uint32 now = tcp_now(); 156 157 if (now > base) 158 return now - base; 159 160 return now + UINT_MAX - base; 161 } 162 163 164 static inline bool 165 state_needs_finish(int32 state) 166 { 167 return state == WAIT_FOR_FINISH_ACKNOWLEDGE 168 || state == FINISH_SENT || state == CLOSING; 169 } 170 171 172 WaitList::WaitList(const char *name) 173 { 174 fCondition = 0; 175 fSem = create_sem(0, name); 176 } 177 178 179 WaitList::~WaitList() 180 { 181 delete_sem(fSem); 182 } 183 184 185 status_t 186 WaitList::InitCheck() const 187 { 188 return fSem; 189 } 190 191 192 status_t 193 WaitList::Wait(MutexLocker &locker, bigtime_t timeout, bool wakeNext) 194 { 195 locker.Unlock(); 196 197 status_t status = B_OK; 198 199 while (status == B_OK && !atomic_test_and_set(&fCondition, 0, 1)) 200 status = acquire_sem_etc(fSem, 1, B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT, 201 timeout); 202 203 locker.Lock(); 204 if (status == B_OK && wakeNext) 205 Signal(); 206 207 return status; 208 } 209 210 211 void 212 WaitList::Signal() 213 { 214 atomic_or(&fCondition, 1); 215 release_sem_etc(fSem, 1, B_DO_NOT_RESCHEDULE | B_RELEASE_IF_WAITING_ONLY); 216 } 217 218 219 TCPEndpoint::TCPEndpoint(net_socket *socket) 220 : 221 ProtocolSocket(socket), 222 fManager(NULL), 223 fReceiveList("tcp receive"), 224 fSendList("tcp send"), 225 fOptions(0), 226 fSendWindowShift(0), 227 fReceiveWindowShift(0), 228 fSendUnacknowledged(0), 229 fSendNext(0), 230 fSendMax(0), 231 fSendWindow(0), 232 fSendMaxWindow(0), 233 fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 234 fSendQueue(socket->send.buffer_size), 235 fInitialSendSequence(0), 236 fDuplicateAcknowledgeCount(0), 237 fRoute(NULL), 238 fReceiveNext(0), 239 fReceiveMaxAdvertised(0), 240 fReceiveWindow(socket->receive.buffer_size), 241 fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 242 fReceiveQueue(socket->receive.buffer_size), 243 fRoundTripTime(TCP_INITIAL_RTT / kTimestampFactor), 244 fRoundTripDeviation(TCP_INITIAL_RTT / kTimestampFactor), 245 fRetransmitTimeout(TCP_INITIAL_RTT), 246 fReceivedTimestamp(0), 247 fCongestionWindow(0), 248 fSlowStartThreshold(0), 249 fState(CLOSED), 250 fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP), 251 fError(B_OK) 252 { 253 //gStackModule->init_timer(&fTimer, _TimeWait, this); 254 255 // TODO: to be replaced with a real locking strategy! 256 mutex_init(&fLock, "tcp lock"); 257 258 gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this); 259 gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer, this); 260 gStackModule->init_timer(&fDelayedAcknowledgeTimer, 261 TCPEndpoint::_DelayedAcknowledgeTimer, this); 262 gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer, this); 263 } 264 265 266 TCPEndpoint::~TCPEndpoint() 267 { 268 mutex_lock(&fLock); 269 270 gStackModule->cancel_timer(&fRetransmitTimer); 271 gStackModule->cancel_timer(&fPersistTimer); 272 gStackModule->cancel_timer(&fDelayedAcknowledgeTimer); 273 gStackModule->cancel_timer(&fTimeWaitTimer); 274 275 if (fManager) { 276 fManager->Unbind(this); 277 return_endpoint_manager(fManager); 278 } 279 280 mutex_destroy(&fLock); 281 } 282 283 284 status_t 285 TCPEndpoint::InitCheck() const 286 { 287 if (fLock.sem < B_OK) 288 return fLock.sem; 289 290 if (fReceiveList.InitCheck() < B_OK) 291 return fReceiveList.InitCheck(); 292 293 if (fSendList.InitCheck() < B_OK) 294 return fSendList.InitCheck(); 295 296 return B_OK; 297 } 298 299 300 // #pragma mark - protocol API 301 302 303 status_t 304 TCPEndpoint::Open() 305 { 306 TRACE("Open()"); 307 308 status_t status = ProtocolSocket::Open(); 309 if (status < B_OK) 310 return status; 311 312 fManager = create_endpoint_manager(Domain()); 313 if (fManager == NULL) 314 return EAFNOSUPPORT; 315 316 return B_OK; 317 } 318 319 320 status_t 321 TCPEndpoint::Close() 322 { 323 TRACE("Close()"); 324 325 MutexLocker lock(fLock); 326 327 if (fState == LISTEN) 328 delete_sem(fAcceptSemaphore); 329 330 if (fState == SYNCHRONIZE_SENT || fState == LISTEN) { 331 fState = CLOSED; 332 return B_OK; 333 } 334 335 status_t status = _ShutdownEgress(true); 336 if (status != B_OK) 337 return status; 338 339 if (socket->options & SO_LINGER) { 340 TRACE("Close(): Lingering for %i secs", socket->linger); 341 342 bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL); 343 344 while (fSendQueue.Used() > 0) { 345 status = fSendList.Wait(lock, maximum); 346 if (status == B_TIMED_OUT || status == B_WOULD_BLOCK) 347 break; 348 else if (status < B_OK) 349 return status; 350 } 351 352 TRACE("Close(): after waiting, the SendQ was left with %lu bytes.", 353 fSendQueue.Used()); 354 } 355 356 // TODO: do i need to wait until fState returns to CLOSED? 357 return B_OK; 358 } 359 360 361 status_t 362 TCPEndpoint::Free() 363 { 364 TRACE("Free()"); 365 366 MutexLocker _(fLock); 367 368 if (fState <= SYNCHRONIZE_SENT || fState == TIME_WAIT) 369 return B_OK; 370 371 // we are only interested in the timer, not in changing state 372 _EnterTimeWait(); 373 return B_BUSY; 374 // we'll be freed later when the 2MSL timer expires 375 } 376 377 378 /*! 379 Creates and sends a synchronize packet to /a address, and then waits 380 until the connection has been established or refused. 381 */ 382 status_t 383 TCPEndpoint::Connect(const sockaddr *address) 384 { 385 TRACE("Connect() on address %s", PrintAddress(address)); 386 387 MutexLocker locker(fLock); 388 389 // Can only call connect() from CLOSED or LISTEN states 390 // otherwise endpoint is considered already connected 391 if (fState == LISTEN) { 392 // this socket is about to connect; remove pending connections in the backlog 393 gSocketModule->set_max_backlog(socket, 0); 394 } else if (fState != CLOSED) 395 return EISCONN; 396 397 status_t status = _PrepareSendPath(address); 398 if (status < B_OK) 399 return status; 400 401 TRACE(" Connect(): starting 3-way handshake..."); 402 403 fState = SYNCHRONIZE_SENT; 404 405 // send SYN 406 status = _SendQueued(); 407 if (status != B_OK) { 408 fState = CLOSED; 409 return status; 410 } 411 412 // If we are running over Loopback, after _SendQueued() returns we 413 // may be in ESTABLISHED already. 414 if (fState == ESTABLISHED) { 415 TRACE(" Connect() completed after _SendQueued()"); 416 return B_OK; 417 } 418 419 // wait until 3-way handshake is complete (if needed) 420 bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT); 421 if (timeout == 0) { 422 // we're a non-blocking socket 423 TRACE(" Connect() delayed, return EINPROGRESS"); 424 return EINPROGRESS; 425 } 426 427 status = _WaitForEstablished(locker, absolute_timeout(timeout)); 428 TRACE(" Connect(): Connection complete: %s (timeout was %llu)", 429 strerror(status), timeout); 430 return posix_error(status); 431 } 432 433 434 status_t 435 TCPEndpoint::Accept(struct net_socket **_acceptedSocket) 436 { 437 TRACE("Accept()"); 438 439 MutexLocker locker(fLock); 440 441 status_t status; 442 bigtime_t timeout = absolute_timeout(socket->receive.timeout); 443 444 do { 445 locker.Unlock(); 446 447 status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT 448 | B_CAN_INTERRUPT, timeout); 449 if (status < B_OK) 450 return status; 451 452 locker.Lock(); 453 status = gSocketModule->dequeue_connected(socket, _acceptedSocket); 454 if (status == B_OK) 455 TRACE(" Accept() returning %p", (*_acceptedSocket)->first_protocol); 456 } while (status < B_OK); 457 458 return status; 459 } 460 461 462 status_t 463 TCPEndpoint::Bind(const sockaddr *address) 464 { 465 if (address == NULL) 466 return B_BAD_VALUE; 467 468 MutexLocker lock(fLock); 469 470 TRACE("Bind() on address %s", PrintAddress(address)); 471 472 if (fState != CLOSED) 473 return EISCONN; 474 475 return fManager->Bind(this, address); 476 } 477 478 479 status_t 480 TCPEndpoint::Unbind(struct sockaddr *address) 481 { 482 TRACE("Unbind()"); 483 484 MutexLocker lock(fLock); 485 return fManager->Unbind(this); 486 } 487 488 489 status_t 490 TCPEndpoint::Listen(int count) 491 { 492 TRACE("Listen()"); 493 494 MutexLocker lock(fLock); 495 496 if (fState != CLOSED) 497 return B_BAD_VALUE; 498 499 fAcceptSemaphore = create_sem(0, "tcp accept"); 500 if (fAcceptSemaphore < B_OK) 501 return ENOBUFS; 502 503 status_t status = fManager->SetPassive(this); 504 if (status < B_OK) { 505 delete_sem(fAcceptSemaphore); 506 fAcceptSemaphore = -1; 507 return status; 508 } 509 510 fState = LISTEN; 511 return B_OK; 512 } 513 514 515 status_t 516 TCPEndpoint::Shutdown(int direction) 517 { 518 TRACE("Shutdown(%i)", direction); 519 520 MutexLocker lock(fLock); 521 522 if (direction == SHUT_RD || direction == SHUT_RDWR) 523 fFlags |= FLAG_NO_RECEIVE; 524 525 if (direction == SHUT_WR || direction == SHUT_RDWR) 526 _ShutdownEgress(false); 527 528 return B_OK; 529 } 530 531 532 /*! 533 Puts data contained in \a buffer into send buffer 534 */ 535 status_t 536 TCPEndpoint::SendData(net_buffer *buffer) 537 { 538 MutexLocker lock(fLock); 539 540 TRACE("SendData(buffer %p, size %lu, flags %lx) [total %lu bytes, has %lu]", 541 buffer, buffer->size, buffer->flags, fSendQueue.Size(), 542 fSendQueue.Free()); 543 544 if (fState == CLOSED) 545 return ENOTCONN; 546 else if (fState == LISTEN) { 547 return EDESTADDRREQ; 548 } else if (fState == FINISH_SENT || fState == FINISH_ACKNOWLEDGED 549 || fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE 550 || fState == TIME_WAIT) { 551 // TODO: send SIGPIPE signal to app? 552 return EPIPE; 553 } 554 555 if (buffer->size > 0) { 556 if (buffer->size > fSendQueue.Size()) 557 return EMSGSIZE; 558 559 bigtime_t timeout = absolute_timeout(socket->send.timeout); 560 561 while (fSendQueue.Free() < buffer->size) { 562 status_t status = fSendList.Wait(lock, timeout); 563 if (status < B_OK) { 564 TRACE(" SendData() returning %s (%d)", 565 strerror(posix_error(status)), (int)posix_error(status)); 566 return posix_error(status); 567 } 568 } 569 570 fSendQueue.Add(buffer); 571 } 572 573 TRACE(" SendData(): %lu bytes used.", fSendQueue.Used()); 574 575 if (fState == ESTABLISHED || fState == FINISH_RECEIVED) 576 _SendQueued(); 577 578 return B_OK; 579 } 580 581 582 ssize_t 583 TCPEndpoint::SendAvailable() 584 { 585 MutexLocker locker(fLock); 586 587 ssize_t available; 588 589 if (is_writable(fState)) 590 available = fSendQueue.Free(); 591 else 592 available = EPIPE; 593 594 TRACE("SendAvailable(): %li", available); 595 return available; 596 } 597 598 599 status_t 600 TCPEndpoint::FillStat(net_stat *stat) 601 { 602 MutexLocker _(fLock); 603 604 strlcpy(stat->state, name_for_state(fState), sizeof(stat->state)); 605 stat->receive_queue_size = fReceiveQueue.Available(); 606 stat->send_queue_size = fSendQueue.Used(); 607 608 return B_OK; 609 } 610 611 612 status_t 613 TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer) 614 { 615 TRACE("ReadData(%lu bytes, flags 0x%x)", numBytes, (unsigned int)flags); 616 617 MutexLocker locker(fLock); 618 619 *_buffer = NULL; 620 621 if (fState == CLOSED) 622 return ENOTCONN; 623 624 bigtime_t timeout = absolute_timeout(socket->receive.timeout); 625 626 if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) { 627 if (flags & MSG_DONTWAIT) 628 return B_WOULD_BLOCK; 629 630 status_t status = _WaitForEstablished(locker, timeout); 631 if (status < B_OK) 632 return posix_error(status); 633 } 634 635 size_t dataNeeded = socket->receive.low_water_mark; 636 637 // When MSG_WAITALL is set then the function should block 638 // until the full amount of data can be returned. 639 if (flags & MSG_WAITALL) 640 dataNeeded = numBytes; 641 642 // TODO: add support for urgent data (MSG_OOB) 643 644 while (true) { 645 if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE 646 || fState == TIME_WAIT) { 647 // ``Connection closing''. 648 return B_OK; 649 } 650 651 if (fReceiveQueue.Available() > 0) { 652 if (fReceiveQueue.Available() >= dataNeeded || 653 ((fReceiveQueue.PushedData() > 0) 654 && (fReceiveQueue.PushedData() >= fReceiveQueue.Available()))) 655 break; 656 } else if (fState == FINISH_RECEIVED) { 657 // ``If no text is awaiting delivery, the RECEIVE will 658 // get a Connection closing''. 659 return B_OK; 660 } 661 662 if ((flags & MSG_DONTWAIT) || (socket->receive.timeout == 0)) 663 return B_WOULD_BLOCK; 664 665 status_t status = fReceiveList.Wait(locker, timeout, false); 666 if (status < B_OK) { 667 // The Open Group base specification mentions that EINTR should be 668 // returned if the recv() is interrupted before _any data_ is 669 // available. So we actually check if there is data, and if so, 670 // push it to the user. 671 if ((status == B_TIMED_OUT || status == B_INTERRUPTED) 672 && fReceiveQueue.Available() > 0) 673 break; 674 675 return posix_error(status); 676 } 677 } 678 679 TRACE(" ReadData(): %lu are available.", fReceiveQueue.Available()); 680 681 if (numBytes < fReceiveQueue.Available()) 682 fReceiveList.Signal(); 683 684 bool clone = (flags & MSG_PEEK); 685 686 ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer); 687 688 TRACE(" ReadData(): %lu bytes kept.", fReceiveQueue.Available()); 689 690 // if we are opening the window, check if we should send an ACK 691 if (!clone) 692 SendAcknowledge(false); 693 694 return receivedBytes; 695 } 696 697 698 ssize_t 699 TCPEndpoint::ReadAvailable() 700 { 701 MutexLocker locker(fLock); 702 703 TRACE("ReadAvailable(): %li", _AvailableData()); 704 705 return _AvailableData(); 706 } 707 708 709 status_t 710 TCPEndpoint::SetSendBufferSize(size_t length) 711 { 712 MutexLocker _(fLock); 713 fSendQueue.SetMaxBytes(length); 714 return B_OK; 715 } 716 717 718 status_t 719 TCPEndpoint::SetReceiveBufferSize(size_t length) 720 { 721 MutexLocker _(fLock); 722 fReceiveQueue.SetMaxBytes(length); 723 return B_OK; 724 } 725 726 727 status_t 728 TCPEndpoint::SetOption(int option, const void *_value, int length) 729 { 730 if (option != TCP_NODELAY) 731 return B_BAD_VALUE; 732 733 if (length != sizeof(int)) 734 return B_BAD_VALUE; 735 736 const int *value = (const int *)_value; 737 738 MutexLocker _(fLock); 739 if (*value) 740 fOptions |= TCP_NODELAY; 741 else 742 fOptions &= ~TCP_NODELAY; 743 744 return B_OK; 745 } 746 747 748 // #pragma mark - misc 749 750 751 bool 752 TCPEndpoint::IsBound() const 753 { 754 return !LocalAddress().IsEmpty(true); 755 } 756 757 758 void 759 TCPEndpoint::DeleteSocket() 760 { 761 // the next call will delete `this'. 762 gSocketModule->delete_socket(socket); 763 } 764 765 766 status_t 767 TCPEndpoint::DelayedAcknowledge() 768 { 769 if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) { 770 // timer was active, send an ACK now (with the exception above, 771 // we send every other ACK) 772 return SendAcknowledge(true); 773 } 774 775 gStackModule->set_timer(&fDelayedAcknowledgeTimer, TCP_DELAYED_ACKNOWLEDGE_TIMEOUT); 776 return B_OK; 777 } 778 779 780 status_t 781 TCPEndpoint::SendAcknowledge(bool force) 782 { 783 return _SendQueued(force, 0); 784 } 785 786 787 void 788 TCPEndpoint::_StartPersistTimer() 789 { 790 gStackModule->set_timer(&fPersistTimer, 1000000LL); 791 } 792 793 794 void 795 TCPEndpoint::_EnterTimeWait() 796 { 797 gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1); 798 } 799 800 801 status_t 802 TCPEndpoint::UpdateTimeWait() 803 { 804 return B_OK; 805 } 806 807 808 // #pragma mark - receive 809 810 811 int32 812 TCPEndpoint::_ListenReceive(tcp_segment_header &segment, net_buffer *buffer) 813 { 814 TRACE("ListenReceive()"); 815 816 // Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state, 817 // but the error behaviour differs 818 if (segment.flags & TCP_FLAG_RESET) 819 return DROP; 820 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 821 return DROP | RESET; 822 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 823 return DROP; 824 825 // TODO: drop broadcast/multicast 826 827 // spawn new endpoint for accept() 828 net_socket *newSocket; 829 if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) 830 return DROP; 831 832 return ((TCPEndpoint *)newSocket->first_protocol)->Spawn(this, 833 segment, buffer); 834 } 835 836 837 int32 838 TCPEndpoint::Spawn(TCPEndpoint *parent, tcp_segment_header &segment, 839 net_buffer *buffer) 840 { 841 MutexLocker _(fLock); 842 843 // TODO error checking 844 ProtocolSocket::Open(); 845 846 fState = SYNCHRONIZE_RECEIVED; 847 fManager = parent->fManager; 848 849 LocalAddress().SetTo(buffer->destination); 850 PeerAddress().SetTo(buffer->source); 851 852 TRACE("Spawn()"); 853 854 // TODO: proper error handling! 855 if (fManager->BindChild(this) < B_OK) 856 return DROP; 857 858 if (_PrepareSendPath(*PeerAddress()) < B_OK) 859 return DROP; 860 861 fOptions = parent->fOptions; 862 fAcceptSemaphore = parent->fAcceptSemaphore; 863 864 _PrepareReceivePath(segment); 865 866 // send SYN+ACK 867 if (_SendQueued() < B_OK) 868 return DROP; 869 870 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 871 // we handled this flag now, it must not be set for further processing 872 873 return _Receive(segment, buffer); 874 } 875 876 877 void 878 TCPEndpoint::DumpInternalState() const 879 { 880 kprintf("Lock: { sem: %ld, holder: %ld }\n", fLock.sem, fLock.holder); 881 kprintf("AcceptSem: %ld\n", fAcceptSemaphore); 882 kprintf("Options: 0x%lx\n", (uint32)fOptions); 883 kprintf("SendWindowShift: %lu\n", (uint32)fSendWindowShift); 884 kprintf("ReceiveWindowShift: %lu\n", (uint32)fReceiveWindowShift); 885 kprintf("SendUnacknowledged: %lu\n", (uint32)fSendUnacknowledged); 886 kprintf("SendNext: %lu\n", (uint32)fSendNext); 887 kprintf("SendMax: %lu\n", (uint32)fSendMax); 888 kprintf("SendWindow: %lu\n", fSendWindow); 889 kprintf("SendMaxWindow: %lu\n", fSendMaxWindow); 890 kprintf("SendMaxSegmentSize: %lu\n", fSendMaxSegmentSize); 891 kprintf("Send-Q: %lu / %lu\n", fSendQueue.Used(), fSendQueue.Size()); 892 kprintf("LastAcknowledgeSent: %lu\n", (uint32)fLastAcknowledgeSent); 893 kprintf("InitialSendSequence: %lu\n", (uint32)fInitialSendSequence); 894 kprintf("DuplicateAcknowledgeCount: %lu\n", fDuplicateAcknowledgeCount); 895 kprintf("ReceiveNext: %lu\n", (uint32)fReceiveNext); 896 kprintf("ReceiveMaxAdvertised: %lu\n", (uint32)fReceiveMaxAdvertised); 897 kprintf("ReceiveWindow: %lu\n", (uint32)fReceiveWindow); 898 kprintf("ReceiveMaxSegmentSize: %lu\n", (uint32)fReceiveMaxSegmentSize); 899 kprintf("Recv-Q: %lu / %lu\n", fReceiveQueue.Available(), 900 fReceiveQueue.Size()); 901 kprintf("InitialReceiveSequence: %lu\n", (uint32)fInitialReceiveSequence); 902 kprintf("RoundTripTime: %ld (dev %ld)\n", fRoundTripTime, 903 fRoundTripDeviation); 904 kprintf("RetransmitTimeout: %llu\n", (uint64)fRetransmitTimeout); 905 kprintf("CongestionWindow: %lu\n", fCongestionWindow); 906 kprintf("SlowStartThreshold: %lu\n", fSlowStartThreshold); 907 kprintf("State: %s\n", name_for_state(fState)); 908 kprintf("Flags: 0x%lx\n", fFlags); 909 } 910 911 912 int32 913 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment, net_buffer *buffer) 914 { 915 TRACE("SynchronizeSentReceive()"); 916 917 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 918 && (fInitialSendSequence >= segment.acknowledge 919 || fSendMax < segment.acknowledge)) 920 return DROP | RESET; 921 922 if (segment.flags & TCP_FLAG_RESET) { 923 fError = ECONNREFUSED; 924 fState = CLOSED; 925 return DROP; 926 } 927 928 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 929 return DROP; 930 931 fSendUnacknowledged = segment.acknowledge; 932 _PrepareReceivePath(segment); 933 934 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) { 935 _MarkEstablished(); 936 } else { 937 // simultaneous open 938 fState = SYNCHRONIZE_RECEIVED; 939 } 940 941 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 942 // we handled this flag now, it must not be set for further processing 943 944 return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE; 945 } 946 947 948 int32 949 TCPEndpoint::SegmentReceived(tcp_segment_header &segment, net_buffer *buffer) 950 { 951 MutexLocker locker(fLock); 952 953 TRACE("SegmentReceived(): buffer %p (%lu bytes) address %s to %s", 954 buffer, buffer->size, PrintAddress(buffer->source), 955 PrintAddress(buffer->destination)); 956 TRACE(" flags 0x%x, seq %lu, ack %lu, wnd %lu", 957 segment.flags, segment.sequence, segment.acknowledge, 958 (uint32)segment.advertised_window << fSendWindowShift); 959 960 int32 segmentAction = DROP; 961 962 switch (fState) { 963 case LISTEN: 964 segmentAction = _ListenReceive(segment, buffer); 965 break; 966 967 case SYNCHRONIZE_SENT: 968 segmentAction = _SynchronizeSentReceive(segment, buffer); 969 break; 970 971 case SYNCHRONIZE_RECEIVED: 972 case ESTABLISHED: 973 case FINISH_RECEIVED: 974 case WAIT_FOR_FINISH_ACKNOWLEDGE: 975 case FINISH_SENT: 976 case FINISH_ACKNOWLEDGED: 977 case CLOSING: 978 case TIME_WAIT: 979 case CLOSED: 980 segmentAction = _SegmentReceived(segment, buffer); 981 break; 982 } 983 984 // process acknowledge action as asked for by the *Receive() method 985 if (segmentAction & IMMEDIATE_ACKNOWLEDGE) 986 SendAcknowledge(true); 987 else if (segmentAction & ACKNOWLEDGE) 988 DelayedAcknowledge(); 989 990 return segmentAction; 991 } 992 993 int32 994 TCPEndpoint::_SegmentReceived(tcp_segment_header &segment, net_buffer *buffer) 995 { 996 uint32 advertisedWindow = (uint32)segment.advertised_window << fSendWindowShift; 997 998 // First, handle the most common case for uni-directional data transfer 999 // (known as header prediction - the segment must not change the window, 1000 // and must be the expected sequence, and contain no control flags) 1001 1002 if (fState == ESTABLISHED 1003 && segment.AcknowledgeOnly() 1004 && fReceiveNext == segment.sequence 1005 && advertisedWindow > 0 && advertisedWindow == fSendWindow 1006 && fSendNext == fSendMax) { 1007 1008 _UpdateTimestamps(segment, buffer->size); 1009 1010 if (buffer->size == 0) { 1011 // this is a pure acknowledge segment - we're on the sending end 1012 if (fSendUnacknowledged < segment.acknowledge 1013 && fSendMax >= segment.acknowledge) { 1014 _Acknowledged(segment); 1015 return DROP; 1016 } 1017 } else if (segment.acknowledge == fSendUnacknowledged 1018 && fReceiveQueue.IsContiguous() 1019 && fReceiveQueue.Free() >= buffer->size 1020 && !(fFlags & FLAG_NO_RECEIVE)) { 1021 if (_AddData(segment, buffer)) 1022 _NotifyReader(); 1023 return KEEP | ((segment.flags & TCP_FLAG_PUSH) ? 1024 IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE); 1025 } 1026 } 1027 1028 // The fast path was not applicable, so we continue with the standard 1029 // processing of the incoming segment 1030 1031 if (fState != SYNCHRONIZE_SENT && fState != LISTEN && fState != CLOSED) { 1032 // 1. check sequence number 1033 if (!segment_in_sequence(segment, buffer->size, fReceiveNext, 1034 fReceiveWindow)) { 1035 TRACE(" Receive(): segment out of window, next: %lu wnd: %lu", 1036 (uint32)fReceiveNext, fReceiveWindow); 1037 if (segment.flags & TCP_FLAG_RESET) 1038 return DROP; 1039 return DROP | IMMEDIATE_ACKNOWLEDGE; 1040 } 1041 } 1042 1043 return _Receive(segment, buffer); 1044 } 1045 1046 1047 // #pragma mark - send 1048 1049 1050 inline uint8 1051 TCPEndpoint::_CurrentFlags() 1052 { 1053 // we don't set FLAG_FINISH here, instead we do it 1054 // conditionally below depending if we are sending 1055 // the last bytes of the send queue. 1056 1057 switch (fState) { 1058 case CLOSED: 1059 return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE; 1060 1061 case SYNCHRONIZE_SENT: 1062 return TCP_FLAG_SYNCHRONIZE; 1063 case SYNCHRONIZE_RECEIVED: 1064 return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE; 1065 1066 case ESTABLISHED: 1067 case FINISH_RECEIVED: 1068 case FINISH_ACKNOWLEDGED: 1069 case TIME_WAIT: 1070 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1071 case FINISH_SENT: 1072 case CLOSING: 1073 return TCP_FLAG_ACKNOWLEDGE; 1074 1075 default: 1076 return B_ERROR; 1077 } 1078 } 1079 1080 1081 inline bool 1082 TCPEndpoint::_ShouldSendSegment(tcp_segment_header &segment, uint32 length, 1083 uint32 segmentMaxSize, uint32 flightSize) 1084 { 1085 if (length > 0) { 1086 // Avoid the silly window syndrome - we only send a segment in case: 1087 // - we have a full segment to send, or 1088 // - we're at the end of our buffer queue, or 1089 // - the buffer is at least larger than half of the maximum send window, or 1090 // - we're retransmitting data 1091 if (length == segmentMaxSize 1092 || (fOptions & TCP_NODELAY) != 0 1093 || tcp_sequence(fSendNext + length) == fSendQueue.LastSequence() 1094 || (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2)) 1095 return true; 1096 } 1097 1098 // check if we need to send a window update to the peer 1099 if (segment.advertised_window > 0) { 1100 // correct the window to take into account what already has been advertised 1101 uint32 window = (segment.advertised_window << fReceiveWindowShift) 1102 - (fReceiveMaxAdvertised - fReceiveNext); 1103 1104 // if we can advertise a window larger than twice the maximum segment 1105 // size, or half the maximum buffer size we send a window update 1106 if (window >= (fReceiveMaxSegmentSize << 1) 1107 || window >= (socket->receive.buffer_size >> 1)) 1108 return true; 1109 } 1110 1111 if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH | TCP_FLAG_RESET)) != 0) 1112 return true; 1113 1114 // there is no reason to send a segment just now 1115 return false; 1116 } 1117 1118 1119 status_t 1120 TCPEndpoint::_SendQueued(bool force) 1121 { 1122 return _SendQueued(force, fSendWindow); 1123 } 1124 1125 1126 /*! 1127 Sends one or more TCP segments with the data waiting in the queue, or some 1128 specific flags that need to be sent. 1129 */ 1130 status_t 1131 TCPEndpoint::_SendQueued(bool force, uint32 sendWindow) 1132 { 1133 if (fRoute == NULL) 1134 return B_ERROR; 1135 1136 // in passive state? 1137 if (fState == LISTEN) 1138 return B_ERROR; 1139 1140 tcp_segment_header segment(_CurrentFlags()); 1141 1142 if ((fOptions & TCP_NOOPT) == 0) { 1143 if (fFlags & FLAG_OPTION_TIMESTAMP) { 1144 segment.options |= TCP_HAS_TIMESTAMPS; 1145 segment.timestamp_reply = fReceivedTimestamp; 1146 segment.timestamp_value = tcp_now(); 1147 } 1148 1149 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) 1150 && (fSendNext == fInitialSendSequence)) { 1151 // add connection establishment options 1152 segment.max_segment_size = fReceiveMaxSegmentSize; 1153 if (fFlags & FLAG_OPTION_WINDOW_SCALE) { 1154 segment.options |= TCP_HAS_WINDOW_SCALE; 1155 segment.window_shift = fReceiveWindowShift; 1156 } 1157 } 1158 } 1159 1160 size_t availableBytes = fReceiveQueue.Free(); 1161 if (fFlags & FLAG_OPTION_WINDOW_SCALE) 1162 segment.advertised_window = availableBytes >> fReceiveWindowShift; 1163 else 1164 segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes); 1165 1166 segment.acknowledge = fReceiveNext; 1167 segment.urgent_offset = 0; 1168 1169 if (fCongestionWindow > 0 && fCongestionWindow < sendWindow) 1170 sendWindow = fCongestionWindow; 1171 1172 // SND.UNA SND.NXT SND.MAX 1173 // | | | 1174 // v v v 1175 // ----------------------------------- 1176 // | effective window | 1177 // ----------------------------------- 1178 1179 // Flight size represents the window of data which is currently in the 1180 // ether. We should never send data such as the flight size becomes larger 1181 // than the effective window. Note however that the effective window may be 1182 // reduced (by congestion for instance), so at some point in time flight 1183 // size may be larger than the currently calculated window. 1184 1185 uint32 flightSize = fSendMax - fSendUnacknowledged; 1186 uint32 consumedWindow = fSendNext - fSendUnacknowledged; 1187 1188 if (consumedWindow > sendWindow) { 1189 sendWindow = 0; 1190 // TODO enter persist state? try to get a window update. 1191 } else 1192 sendWindow -= consumedWindow; 1193 1194 if (force && sendWindow == 0 && fSendNext <= fSendQueue.LastSequence()) { 1195 // send one byte of data to ask for a window update 1196 // (triggered by the persist timer) 1197 sendWindow = 1; 1198 } 1199 1200 uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow); 1201 tcp_sequence previousSendNext = fSendNext; 1202 1203 do { 1204 uint32 segmentMaxSize = fSendMaxSegmentSize 1205 - tcp_options_length(segment); 1206 uint32 segmentLength = min_c(length, segmentMaxSize); 1207 1208 if (fSendNext + segmentLength == fSendQueue.LastSequence()) { 1209 if (state_needs_finish(fState)) 1210 segment.flags |= TCP_FLAG_FINISH; 1211 if (length > 0) 1212 segment.flags |= TCP_FLAG_PUSH; 1213 } 1214 1215 // Determine if we should really send this segment 1216 if (!force && !_ShouldSendSegment(segment, segmentLength, 1217 segmentMaxSize, flightSize)) { 1218 if (fSendQueue.Available() 1219 && !gStackModule->is_timer_active(&fPersistTimer) 1220 && !gStackModule->is_timer_active(&fRetransmitTimer)) 1221 _StartPersistTimer(); 1222 break; 1223 } 1224 1225 net_buffer *buffer = gBufferModule->create(256); 1226 if (buffer == NULL) 1227 return B_NO_MEMORY; 1228 1229 status_t status = B_OK; 1230 if (segmentLength > 0) 1231 fSendQueue.Get(buffer, fSendNext, segmentLength); 1232 if (status < B_OK) { 1233 gBufferModule->free(buffer); 1234 return status; 1235 } 1236 1237 LocalAddress().CopyTo(buffer->source); 1238 PeerAddress().CopyTo(buffer->destination); 1239 1240 uint32 size = buffer->size; 1241 segment.sequence = fSendNext; 1242 1243 TRACE("SendQueued(): buffer %p (%lu bytes) address %s to %s", 1244 buffer, buffer->size, PrintAddress(buffer->source), 1245 PrintAddress(buffer->destination)); 1246 TRACE(" flags 0x%x, seq %lu, ack %lu, rwnd %hu, cwnd %lu" 1247 ", ssthresh %lu", segment.flags, segment.sequence, 1248 segment.acknowledge, segment.advertised_window, 1249 fCongestionWindow, fSlowStartThreshold); 1250 TRACE(" len %lu first %lu last %lu", segmentLength, 1251 (uint32)fSendQueue.FirstSequence(), 1252 (uint32)fSendQueue.LastSequence()); 1253 1254 PROBE(buffer, sendWindow); 1255 sendWindow -= buffer->size; 1256 1257 status = add_tcp_header(AddressModule(), segment, buffer); 1258 if (status != B_OK) { 1259 gBufferModule->free(buffer); 1260 return status; 1261 } 1262 1263 // Update send status - we need to do this before we send the data 1264 // for local connections as the answer is directly handled 1265 1266 if (segment.flags & TCP_FLAG_SYNCHRONIZE) { 1267 segment.options &= ~TCP_HAS_WINDOW_SCALE; 1268 segment.max_segment_size = 0; 1269 size++; 1270 } 1271 1272 if (segment.flags & TCP_FLAG_FINISH) 1273 size++; 1274 1275 uint32 sendMax = fSendMax; 1276 fSendNext += size; 1277 if (fSendMax < fSendNext) 1278 fSendMax = fSendNext; 1279 1280 fReceiveMaxAdvertised = fReceiveNext 1281 + ((uint32)segment.advertised_window << fReceiveWindowShift); 1282 1283 status = next->module->send_routed_data(next, fRoute, buffer); 1284 if (status < B_OK) { 1285 gBufferModule->free(buffer); 1286 1287 fSendNext = segment.sequence; 1288 fSendMax = sendMax; 1289 // restore send status 1290 return status; 1291 } 1292 1293 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 1294 fLastAcknowledgeSent = segment.acknowledge; 1295 1296 length -= segmentLength; 1297 segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET | TCP_FLAG_FINISH); 1298 } while (length > 0); 1299 1300 // if we sent data from the beggining of the send queue, 1301 // start the retransmition timer 1302 if (previousSendNext == fSendUnacknowledged 1303 && fSendNext > previousSendNext) { 1304 TRACE(" SendQueue(): set retransmit timer with rto %llu", 1305 fRetransmitTimeout); 1306 1307 gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout); 1308 } 1309 1310 return B_OK; 1311 } 1312 1313 1314 int 1315 TCPEndpoint::_GetMSS(const sockaddr *address) const 1316 { 1317 return next->module->get_mtu(next, address) - sizeof(tcp_header); 1318 } 1319 1320 1321 status_t 1322 TCPEndpoint::_ShutdownEgress(bool closing) 1323 { 1324 tcp_state previousState = fState; 1325 1326 if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED) 1327 fState = FINISH_SENT; 1328 else if (fState == FINISH_RECEIVED) 1329 fState = WAIT_FOR_FINISH_ACKNOWLEDGE; 1330 else 1331 return B_OK; 1332 1333 status_t status = _SendQueued(); 1334 if (status != B_OK) { 1335 fState = previousState; 1336 return status; 1337 } 1338 1339 return B_OK; 1340 } 1341 1342 1343 ssize_t 1344 TCPEndpoint::_AvailableData() const 1345 { 1346 // TODO: Refer to the FLAG_NO_RECEIVE comment above regarding 1347 // the application of FLAG_NO_RECEIVE in listen()ing 1348 // sockets. 1349 if (fState == LISTEN) 1350 return gSocketModule->count_connected(socket); 1351 else if (fState == SYNCHRONIZE_SENT) 1352 return 0; 1353 1354 ssize_t availableData = fReceiveQueue.Available(); 1355 1356 if (availableData == 0 && !_ShouldReceive()) 1357 return ENOTCONN; 1358 1359 return availableData; 1360 } 1361 1362 1363 void 1364 TCPEndpoint::_NotifyReader() 1365 { 1366 fReceiveList.Signal(); 1367 gSocketModule->notify(socket, B_SELECT_READ, _AvailableData()); 1368 } 1369 1370 1371 bool 1372 TCPEndpoint::_ShouldReceive() const 1373 { 1374 if (fFlags & FLAG_NO_RECEIVE) 1375 return false; 1376 1377 return fState == ESTABLISHED || fState == FINISH_SENT 1378 || fState == FINISH_ACKNOWLEDGED; 1379 } 1380 1381 1382 int32 1383 TCPEndpoint::_Receive(tcp_segment_header &segment, net_buffer *buffer) 1384 { 1385 uint32 advertisedWindow = (uint32)segment.advertised_window << fSendWindowShift; 1386 1387 size_t segmentLength = buffer->size; 1388 1389 if (segment.flags & TCP_FLAG_RESET) { 1390 // is this a valid reset? 1391 if (fLastAcknowledgeSent <= segment.sequence 1392 && tcp_sequence(segment.sequence) 1393 < (fLastAcknowledgeSent + fReceiveWindow)) { 1394 if (fState == SYNCHRONIZE_RECEIVED) 1395 fError = ECONNREFUSED; 1396 else if (fState == CLOSING || fState == TIME_WAIT 1397 || fState == WAIT_FOR_FINISH_ACKNOWLEDGE) 1398 fError = ENOTCONN; 1399 else 1400 fError = ECONNRESET; 1401 1402 _NotifyReader(); 1403 fState = CLOSED; 1404 } 1405 1406 return DROP; 1407 } 1408 1409 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0 1410 || (fState == SYNCHRONIZE_RECEIVED 1411 && (fInitialReceiveSequence > segment.sequence 1412 || (segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 1413 && (fSendUnacknowledged > segment.acknowledge 1414 || fSendMax < segment.acknowledge)))) { 1415 // reset the connection - either the initial SYN was faulty, or we 1416 // received a SYN within the data stream 1417 return DROP | RESET; 1418 } 1419 1420 fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow); 1421 // the window must not shrink 1422 1423 // trim buffer to be within the receive window 1424 int32 drop = fReceiveNext - segment.sequence; 1425 if (drop > 0) { 1426 if ((uint32)drop > buffer->size 1427 || ((uint32)drop == buffer->size 1428 && (segment.flags & TCP_FLAG_FINISH) == 0)) { 1429 // don't accidently remove a FIN we shouldn't remove 1430 segment.flags &= ~TCP_FLAG_FINISH; 1431 drop = buffer->size; 1432 } 1433 1434 // remove duplicate data at the start 1435 TRACE("* remove %ld bytes from the start", drop); 1436 gBufferModule->remove_header(buffer, drop); 1437 segment.sequence += drop; 1438 } 1439 1440 int32 action = KEEP; 1441 1442 drop = segment.sequence + buffer->size - (fReceiveNext + fReceiveWindow); 1443 if (drop > 0) { 1444 // remove data exceeding our window 1445 if ((uint32)drop >= buffer->size) { 1446 // if we can accept data, or the segment is not what we'd expect, 1447 // drop the segment (an immediate acknowledge is always triggered) 1448 if (fReceiveWindow != 0 || segment.sequence != fReceiveNext) 1449 return DROP | IMMEDIATE_ACKNOWLEDGE; 1450 1451 action |= IMMEDIATE_ACKNOWLEDGE; 1452 } 1453 1454 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1455 // we need to remove the finish, too, as part of the data 1456 drop--; 1457 } 1458 1459 segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH); 1460 TRACE("* remove %ld bytes from the end", drop); 1461 gBufferModule->remove_trailer(buffer, drop); 1462 } 1463 1464 if (advertisedWindow > fSendWindow) 1465 TRACE(" Receive(): Window update %lu -> %lu", fSendWindow, 1466 advertisedWindow); 1467 1468 fSendWindow = advertisedWindow; 1469 if (advertisedWindow > fSendMaxWindow) 1470 fSendMaxWindow = advertisedWindow; 1471 1472 // Then look at the acknowledgement for any updates 1473 1474 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) { 1475 // process acknowledged data 1476 if (fState == SYNCHRONIZE_RECEIVED) 1477 _MarkEstablished(); 1478 1479 if (fSendMax < segment.acknowledge || fState == TIME_WAIT) 1480 return DROP | IMMEDIATE_ACKNOWLEDGE; 1481 1482 if (segment.acknowledge < fSendUnacknowledged) { 1483 if (buffer->size == 0 && advertisedWindow == fSendWindow 1484 && (segment.flags & TCP_FLAG_FINISH) == 0) { 1485 TRACE("Receive(): duplicate ack!"); 1486 1487 _DuplicateAcknowledge(segment); 1488 } 1489 1490 return DROP; 1491 } else { 1492 // this segment acknowledges in flight data 1493 1494 if (fDuplicateAcknowledgeCount >= 3) { 1495 // deflate the window. 1496 fCongestionWindow = fSlowStartThreshold; 1497 } 1498 1499 fDuplicateAcknowledgeCount = 0; 1500 1501 if (fSendMax == segment.acknowledge) 1502 TRACE("Receive(): all inflight data ack'd!"); 1503 1504 if (segment.acknowledge > fSendQueue.LastSequence() 1505 && fState > ESTABLISHED) { 1506 TRACE("Receive(): FIN has been acknowledged!"); 1507 1508 switch (fState) { 1509 case FINISH_SENT: 1510 fState = FINISH_ACKNOWLEDGED; 1511 break; 1512 case CLOSING: 1513 fState = TIME_WAIT; 1514 _EnterTimeWait(); 1515 return DROP; 1516 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1517 fState = CLOSED; 1518 break; 1519 1520 default: 1521 break; 1522 } 1523 } 1524 1525 if (fState != CLOSED) 1526 _Acknowledged(segment); 1527 } 1528 } 1529 1530 if (segment.flags & TCP_FLAG_URGENT) { 1531 if (fState == ESTABLISHED || fState == FINISH_SENT 1532 || fState == FINISH_ACKNOWLEDGED) { 1533 // TODO: Handle urgent data: 1534 // - RCV.UP <- max(RCV.UP, SEG.UP) 1535 // - signal the user that urgent data is available (SIGURG) 1536 } 1537 } 1538 1539 bool notify = false; 1540 1541 if (buffer->size > 0 && _ShouldReceive()) 1542 notify = _AddData(segment, buffer); 1543 else 1544 action = (action & ~KEEP) | DROP; 1545 1546 if (segment.flags & TCP_FLAG_FINISH) { 1547 segmentLength++; 1548 if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) { 1549 TRACE("Receive(): peer is finishing connection!"); 1550 fReceiveNext++; 1551 notify = true; 1552 1553 // FIN implies PSH 1554 fReceiveQueue.SetPushPointer(); 1555 1556 // we'll reply immediatly to the FIN if we are not 1557 // transitioning to TIME WAIT so we immediatly ACK it. 1558 action |= IMMEDIATE_ACKNOWLEDGE; 1559 1560 // other side is closing connection; change states 1561 switch (fState) { 1562 case ESTABLISHED: 1563 case SYNCHRONIZE_RECEIVED: 1564 fState = FINISH_RECEIVED; 1565 break; 1566 case FINISH_SENT: 1567 // simultaneous close 1568 fState = CLOSING; 1569 break; 1570 case FINISH_ACKNOWLEDGED: 1571 fState = TIME_WAIT; 1572 _EnterTimeWait(); 1573 break; 1574 default: 1575 break; 1576 } 1577 } 1578 } 1579 1580 if (notify) 1581 _NotifyReader(); 1582 1583 if (buffer->size > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0) 1584 action |= ACKNOWLEDGE; 1585 1586 _UpdateTimestamps(segment, segmentLength); 1587 1588 TRACE("Receive() Action %ld", action); 1589 1590 return action; 1591 } 1592 1593 1594 void 1595 TCPEndpoint::_UpdateTimestamps(tcp_segment_header &segment, 1596 size_t segmentLength) 1597 { 1598 if (fFlags & FLAG_OPTION_TIMESTAMP) { 1599 tcp_sequence sequence(segment.sequence); 1600 1601 if ((fLastAcknowledgeSent >= sequence 1602 && fLastAcknowledgeSent < (sequence + segmentLength))) 1603 fReceivedTimestamp = segment.timestamp_value; 1604 } 1605 } 1606 1607 1608 void 1609 TCPEndpoint::_MarkEstablished() 1610 { 1611 fState = ESTABLISHED; 1612 1613 if (socket->parent != NULL) { 1614 gSocketModule->set_connected(socket); 1615 release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE); 1616 } 1617 1618 fSendList.Signal(); 1619 } 1620 1621 1622 status_t 1623 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout) 1624 { 1625 while (fState != ESTABLISHED) { 1626 status_t status = fSendList.Wait(locker, timeout); 1627 if (status < B_OK) 1628 return status; 1629 } 1630 1631 return B_OK; 1632 } 1633 1634 1635 bool 1636 TCPEndpoint::_AddData(tcp_segment_header &segment, net_buffer *buffer) 1637 { 1638 fReceiveQueue.Add(buffer, segment.sequence); 1639 fReceiveNext = fReceiveQueue.NextSequence(); 1640 1641 TRACE(" _AddData(): adding data, receive next = %lu. Now have %lu bytes.", 1642 (uint32)fReceiveNext, fReceiveQueue.Available()); 1643 1644 if (segment.flags & TCP_FLAG_PUSH) 1645 fReceiveQueue.SetPushPointer(); 1646 1647 return fReceiveQueue.Available() > 0; 1648 } 1649 1650 1651 void 1652 TCPEndpoint::_PrepareReceivePath(tcp_segment_header &segment) 1653 { 1654 fInitialReceiveSequence = segment.sequence; 1655 1656 // count the received SYN 1657 segment.sequence++; 1658 1659 fReceiveNext = segment.sequence; 1660 fReceiveQueue.SetInitialSequence(segment.sequence); 1661 1662 if ((fOptions & TCP_NOOPT) == 0) { 1663 if (segment.max_segment_size > 0) 1664 fSendMaxSegmentSize = segment.max_segment_size; 1665 1666 if (segment.options & TCP_HAS_WINDOW_SCALE) { 1667 fFlags |= FLAG_OPTION_WINDOW_SCALE; 1668 fSendWindowShift = segment.window_shift; 1669 } else { 1670 fFlags &= ~FLAG_OPTION_WINDOW_SCALE; 1671 fReceiveWindowShift = 0; 1672 } 1673 1674 if (segment.options & TCP_HAS_TIMESTAMPS) { 1675 fFlags |= FLAG_OPTION_TIMESTAMP; 1676 fReceivedTimestamp = segment.timestamp_value; 1677 } else 1678 fFlags &= ~FLAG_OPTION_TIMESTAMP; 1679 } 1680 1681 fCongestionWindow = 2 * fSendMaxSegmentSize; 1682 fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift; 1683 } 1684 1685 1686 status_t 1687 TCPEndpoint::_PrepareSendPath(const sockaddr *peer) 1688 { 1689 if (fRoute == NULL) { 1690 fRoute = gDatalinkModule->get_route(Domain(), peer); 1691 if (fRoute == NULL) 1692 return ENETUNREACH; 1693 } 1694 1695 // make sure connection does not already exist 1696 status_t status = fManager->SetConnection(this, *LocalAddress(), peer, 1697 fRoute->interface->address); 1698 if (status < B_OK) 1699 return status; 1700 1701 fInitialSendSequence = system_time() >> 4; 1702 fSendNext = fInitialSendSequence; 1703 fSendUnacknowledged = fInitialSendSequence; 1704 fSendMax = fInitialSendSequence; 1705 1706 // we are counting the SYN here 1707 fSendQueue.SetInitialSequence(fSendNext + 1); 1708 1709 fReceiveMaxSegmentSize = _GetMSS(peer); 1710 1711 // Compute the window shift we advertise to our peer - if it doesn't support 1712 // this option, this will be reset to 0 (when its SYN is received) 1713 fReceiveWindowShift = 0; 1714 while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT 1715 && (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) { 1716 fReceiveWindowShift++; 1717 } 1718 1719 return B_OK; 1720 } 1721 1722 1723 void 1724 TCPEndpoint::_Acknowledged(tcp_segment_header &segment) 1725 { 1726 size_t previouslyUsed = fSendQueue.Used(); 1727 1728 fSendQueue.RemoveUntil(segment.acknowledge); 1729 fSendUnacknowledged = segment.acknowledge; 1730 1731 if (fSendNext < fSendUnacknowledged) 1732 fSendNext = fSendUnacknowledged; 1733 1734 if (fSendUnacknowledged == fSendMax) 1735 gStackModule->cancel_timer(&fRetransmitTimer); 1736 1737 if (fSendQueue.Used() < previouslyUsed) { 1738 // this ACK acknowledged data 1739 1740 if (segment.options & TCP_HAS_TIMESTAMPS) 1741 _UpdateSRTT(tcp_diff_timestamp(segment.timestamp_reply)); 1742 else { 1743 // TODO Fallback to RFC 793 type estimation 1744 } 1745 1746 if (is_writable(fState)) { 1747 // notify threads waiting on the socket to become writable again 1748 fSendList.Signal(); 1749 gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Used()); 1750 } 1751 1752 if (fCongestionWindow < fSlowStartThreshold) 1753 fCongestionWindow += fSendMaxSegmentSize; 1754 } 1755 1756 if (fCongestionWindow >= fSlowStartThreshold) { 1757 uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize; 1758 1759 if (increment < fCongestionWindow) 1760 increment = 1; 1761 else 1762 increment /= fCongestionWindow; 1763 1764 fCongestionWindow += increment; 1765 } 1766 1767 // if there is data left to be send, send it now 1768 if (fSendQueue.Used() > 0) 1769 _SendQueued(); 1770 } 1771 1772 1773 void 1774 TCPEndpoint::_Retransmit() 1775 { 1776 TRACE("Retransmit()"); 1777 _ResetSlowStart(); 1778 fSendNext = fSendUnacknowledged; 1779 _SendQueued(); 1780 } 1781 1782 1783 void 1784 TCPEndpoint::_UpdateSRTT(int32 roundTripTime) 1785 { 1786 int32 rtt = roundTripTime; 1787 1788 // Update_SRTT() as per Van Jacobson 1789 rtt -= (fRoundTripTime / 8); 1790 fRoundTripTime += rtt; 1791 if (rtt < 0) 1792 rtt = -rtt; 1793 rtt -= (fRoundTripDeviation / 4); 1794 fRoundTripDeviation += rtt; 1795 1796 fRetransmitTimeout = ((fRoundTripTime / 4 + 1797 fRoundTripDeviation) / 2) * kTimestampFactor; 1798 1799 TRACE(" RTO is now %llu (after rtt %ldms)", fRetransmitTimeout, 1800 roundTripTime); 1801 } 1802 1803 1804 void 1805 TCPEndpoint::_ResetSlowStart() 1806 { 1807 fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged) / 2, 1808 2 * fSendMaxSegmentSize); 1809 fCongestionWindow = fSendMaxSegmentSize; 1810 } 1811 1812 1813 void 1814 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment) 1815 { 1816 fDuplicateAcknowledgeCount++; 1817 1818 if (fDuplicateAcknowledgeCount < 3) 1819 return; 1820 else if (fDuplicateAcknowledgeCount == 3) { 1821 _ResetSlowStart(); 1822 fCongestionWindow = fSlowStartThreshold + 3 1823 * fSendMaxSegmentSize; 1824 fSendNext = segment.acknowledge; 1825 } else if (fDuplicateAcknowledgeCount > 3) 1826 fCongestionWindow += fSendMaxSegmentSize; 1827 1828 _SendQueued(); 1829 } 1830 1831 1832 // #pragma mark - timer 1833 1834 1835 /*static*/ void 1836 TCPEndpoint::_RetransmitTimer(net_timer *timer, void *data) 1837 { 1838 TCPEndpoint *endpoint = (TCPEndpoint *)data; 1839 1840 MutexLocker locker(endpoint->fLock); 1841 if (!locker.IsLocked()) 1842 return; 1843 1844 endpoint->_Retransmit(); 1845 } 1846 1847 1848 /*static*/ void 1849 TCPEndpoint::_PersistTimer(net_timer *timer, void *data) 1850 { 1851 TCPEndpoint *endpoint = (TCPEndpoint *)data; 1852 1853 MutexLocker locker(endpoint->fLock); 1854 if (!locker.IsLocked()) 1855 return; 1856 1857 endpoint->_SendQueued(true); 1858 } 1859 1860 1861 /*static*/ void 1862 TCPEndpoint::_DelayedAcknowledgeTimer(struct net_timer *timer, void *data) 1863 { 1864 TCPEndpoint *endpoint = (TCPEndpoint *)data; 1865 1866 MutexLocker locker(endpoint->fLock); 1867 if (!locker.IsLocked()) 1868 return; 1869 1870 endpoint->SendAcknowledge(true); 1871 } 1872 1873 1874 /*static*/ void 1875 TCPEndpoint::_TimeWaitTimer(struct net_timer *timer, void *data) 1876 { 1877 TCPEndpoint *endpoint = (TCPEndpoint *)data; 1878 1879 if (mutex_lock(&endpoint->fLock) < B_OK) 1880 return; 1881 1882 endpoint->DeleteSocket(); 1883 } 1884 1885